diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py index c3cf714ce6c10..8e25fd61d6b32 100644 --- a/.ci/compute_projects.py +++ b/.ci/compute_projects.py @@ -19,6 +19,7 @@ PROJECT_DEPENDENCIES = { "llvm": set(), "clang": {"llvm"}, + "CIR": {"clang", "mlir"}, "bolt": {"clang", "lld", "llvm"}, "clang-tools-extra": {"clang", "llvm"}, "compiler-rt": {"clang", "lld"}, @@ -55,6 +56,7 @@ ".ci": { "llvm", "clang", + "CIR", "lld", "lldb", "bolt", @@ -128,6 +130,7 @@ "lldb": "check-lldb", "llvm": "check-llvm", "clang": "check-clang", + "CIR": "check-clang-cir", "bolt": "check-bolt", "lld": "check-lld", "flang": "check-flang", @@ -247,6 +250,14 @@ def _get_modified_projects(modified_files: list[str]) -> Set[str]: # capacity. if len(path_parts) > 3 and path_parts[:3] == ("llvm", "utils", "gn"): continue + # If the file is in the clang/lib/CIR directory, add the CIR project. + if len(path_parts) > 3 and ( + path_parts[:3] == ("clang", "lib", "CIR") + or path_parts[:3] == ("clang", "test", "CIR") + or path_parts[:4] == ("clang", "include", "clang", "CIR") + ): + modified_projects.add("CIR") + # Fall through to add clang. modified_projects.add(pathlib.Path(modified_file).parts[0]) return modified_projects @@ -267,6 +278,13 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]: runtimes_check_targets_needs_reconfig = _compute_project_check_targets( runtimes_to_test_needs_reconfig ) + + # CIR is used as a pseudo-project in this script. It is built as part of the + # clang build, but it requires an explicit option to enable. We set that + # option here, and remove it from the projects_to_build list. + enable_cir = "ON" if "CIR" in projects_to_build else "OFF" + projects_to_build.discard("CIR") + # We use a semicolon to separate the projects/runtimes as they get passed # to the CMake invocation and thus we need to use the CMake list separator # (;). 
We use spaces to separate the check targets as they end up getting @@ -279,6 +297,7 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]: "runtimes_check_targets_needs_reconfig": " ".join( sorted(runtimes_check_targets_needs_reconfig) ), + "enable_cir": enable_cir, } diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py index 6299931e1ec34..732514c96f5a6 100644 --- a/.ci/compute_projects_test.py +++ b/.ci/compute_projects_test.py @@ -104,6 +104,10 @@ def test_clang(self): env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) + self.assertEqual( + env_variables["enable_cir"], + "OFF", + ) def test_clang_windows(self): env_variables = compute_projects.get_env_variables( @@ -126,6 +130,32 @@ def test_clang_windows(self): env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) + self.assertEqual(env_variables["enable_cir"], "OFF") + + def test_cir(self): + env_variables = compute_projects.get_env_variables( + ["clang/lib/CIR/CMakeLists.txt"], "Linux" + ) + self.assertEqual( + env_variables["projects_to_build"], + "clang;clang-tools-extra;lld;llvm;mlir", + ) + self.assertEqual( + env_variables["project_check_targets"], + "check-clang check-clang-cir check-clang-tools", + ) + self.assertEqual( + env_variables["runtimes_to_build"], "compiler-rt;libcxx;libcxxabi;libunwind" + ) + self.assertEqual( + env_variables["runtimes_check_targets"], + "check-compiler-rt", + ) + self.assertEqual( + env_variables["runtimes_check_targets_needs_reconfig"], + "check-cxx check-cxxabi check-unwind", + ) + self.assertEqual(env_variables["enable_cir"], "ON") def test_bolt(self): env_variables = compute_projects.get_env_variables( @@ -158,6 +188,7 @@ def test_mlir(self): self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") + self.assertEqual(env_variables["enable_cir"], "OFF") def test_flang(self): env_variables = compute_projects.get_env_variables( @@ -168,6 +199,7 @@ def test_flang(self): self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") + self.assertEqual(env_variables["enable_cir"], "OFF") def test_invalid_subproject(self): env_variables = compute_projects.get_env_variables( @@ -237,7 +269,7 @@ def test_ci(self): ) self.assertEqual( env_variables["project_check_targets"], - "check-bolt check-clang check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly", + "check-bolt check-clang check-clang-cir check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly", ) self.assertEqual( env_variables["runtimes_to_build"], diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 8d1faab13986c..6db24d894eb73 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -21,12 +21,7 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}" INSTALL_DIR="${BUILD_DIR}/install" rm -rf "${BUILD_DIR}" -ccache --zero-stats - -if [[ -n "${CLEAR_CACHE:-}" ]]; then - echo "clearing cache" - ccache --clear -fi +sccache --zero-stats mkdir -p artifacts/reproducers @@ -36,7 +31,7 @@ export CLANG_CRASH_DIAGNOSTICS_DIR=`realpath artifacts/reproducers` function at-exit { retcode=$? 
- ccache --print-stats > artifacts/ccache_stats.txt + sccache --show-stats > artifacts/sccache_stats.txt cp "${BUILD_DIR}"/.ninja_log artifacts/.ninja_log cp "${BUILD_DIR}"/test-results.*.xml artifacts/ || : @@ -53,6 +48,7 @@ targets="${2}" runtimes="${3}" runtime_targets="${4}" runtime_targets_needs_reconfig="${5}" +enable_cir="${6}" lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests" @@ -72,13 +68,15 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -G Ninja \ -D CMAKE_PREFIX_PATH="${HOME}/.local" \ -D CMAKE_BUILD_TYPE=Release \ + -D CLANG_ENABLE_CIR=${enable_cir} \ -D LLVM_ENABLE_ASSERTIONS=ON \ -D LLVM_BUILD_EXAMPLES=ON \ -D COMPILER_RT_BUILD_LIBFUZZER=OFF \ -D LLVM_LIT_ARGS="${lit_args}" \ -D LLVM_ENABLE_LLD=ON \ -D CMAKE_CXX_FLAGS=-gmlt \ - -D LLVM_CCACHE_BUILD=ON \ + -D CMAKE_C_COMPILER_LAUNCHER=sccache \ + -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \ -D LIBCXX_CXX_ABI=libcxxabi \ -D MLIR_ENABLE_BINDINGS_PYTHON=ON \ -D LLDB_ENABLE_PYTHON=ON \ diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 176350fac604c..50a741677d734 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -21,11 +21,6 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}" rm -rf "${BUILD_DIR}" -if [[ -n "${CLEAR_CACHE:-}" ]]; then - echo "clearing sccache" - rm -rf "$SCCACHE_DIR" -fi - sccache --zero-stats function at-exit { retcode=$? diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index b05e9c6c56ed0..8e0fa8d42d735 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -48,6 +48,9 @@ flang:frontend: - flang/Evaluate/**/* - flang/Semantics/**/* +libclc: + - libclc/** + HLSL: - clang/*HLSL*/**/* - clang/**/*HLSL* @@ -717,6 +720,8 @@ mlgo: - llvm/lib/Analysis/IR2Vec.cpp - llvm/lib/Analysis/models/** - llvm/test/Analysis/IR2Vec/** + - llvm/tools/llvm-ir2vec/** + - llvm/docs/CommandGuide/llvm-ir2vec.rst tools:llvm-exegesis: - llvm/tools/llvm-exegesis/** diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml index 59079f057d021..f76c69f29fb30 100644 --- a/.github/workflows/build-ci-container-windows.yml +++ b/.github/workflows/build-ci-container-windows.yml @@ -11,8 +11,6 @@ on: - .github/workflows/build-ci-container-windows.yml - '.github/workflows/containers/github-action-ci-windows/**' pull_request: - branches: - - main paths: - .github/workflows/build-ci-container-windows.yml - '.github/workflows/containers/github-action-ci-windows/**' diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml index 3159aae32ca51..7f01264af8534 100644 --- a/.github/workflows/build-ci-container.yml +++ b/.github/workflows/build-ci-container.yml @@ -11,8 +11,6 @@ on: - .github/workflows/build-ci-container.yml - '.github/workflows/containers/github-action-ci/**' pull_request: - branches: - - main paths: - .github/workflows/build-ci-container.yml - '.github/workflows/containers/github-action-ci/**' diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index efe08ebc221c5..69c71f638e2ac 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -63,11 +63,21 @@ RUN apt-get update && \ python3-pip \ ccache \ file \ - tzdata \ - sccache && \ + tzdata && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# We need sccache for caching. 
We cannot use the apt repository version because
+# it is too old and has bugs related to features we require (particularly GCS
+# caching), so we manually install it here.
+# TODO(boomanaiden154): We should return to installing this from the apt
+# repository once a version containing the necessary bug fixes is available.
+RUN curl -L 'https://github.com/mozilla/sccache/releases/download/v0.10.0/sccache-v0.10.0-x86_64-unknown-linux-musl.tar.gz' > /tmp/sccache.tar.gz && \
+    echo "1fbb35e135660d04a2d5e42b59c7874d39b3deb17de56330b25b713ec59f849b /tmp/sccache.tar.gz" | sha256sum -c && \
+    tar xzf /tmp/sccache.tar.gz -O --wildcards '*/sccache' > '/usr/local/bin/sccache' && \
+    rm /tmp/sccache.tar.gz && \
+    chmod +x /usr/local/bin/sccache
+
 ENV LLVM_SYSROOT=$LLVM_SYSROOT
 ENV PATH=${LLVM_SYSROOT}/bin:${PATH}
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index ff63355222065..c51325e2f0d45 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -34,10 +34,6 @@ jobs:
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 2
-      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
-        with:
-          max-size: "2000M"
       - name: Build and Test
         # Mark the job as a success even if the step fails so that people do
        # not get notified while the new premerge pipeline is in an
@@ -61,7 +57,14 @@ jobs:
           export CC=/opt/llvm/bin/clang
           export CXX=/opt/llvm/bin/clang++
-          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}"
+          # This environment variable is passed into the container through the
+          # runner pod definition. This differs between our two clusters, which
+          # is why we do not hardcode it.
+ export SCCACHE_GCS_BUCKET=$CACHE_GCS_BUCKET + export SCCACHE_GCS_RW_MODE=READ_WRITE + sccache --start-server + + ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}" "${enable_cir}" - name: Upload Artifacts if: '!cancelled()' uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 @@ -85,11 +88,6 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 2 - - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 - with: - variant: "sccache" - max-size: "2000M" - name: Compute Projects id: vars run: | @@ -112,7 +110,7 @@ jobs: shell: cmd run: | call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64 - bash .ci/monolithic-windows.sh "${{ steps.vars.outputs.windows-projects }}" "${{ steps.vars.outputs.windows-check-targets }}" + bash -c "export SCCACHE_GCS_BUCKET=$CACHE_GCS_BUCKET; export SCCACHE_GCS_RW_MODE=READ_WRITE; sccache --start-server; .ci/monolithic-windows.sh \"${{ steps.vars.outputs.windows-projects }}\" \"${{ steps.vars.outputs.windows-check-targets }}\"" - name: Upload Artifacts if: '!cancelled()' uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 diff --git a/bolt/utils/nfc-check-setup.py b/bolt/utils/nfc-check-setup.py index 275ac7b886d00..d8666e2158499 100755 --- a/bolt/utils/nfc-check-setup.py +++ b/bolt/utils/nfc-check-setup.py @@ -7,6 +7,8 @@ import sys import textwrap +msg_prefix = "\n> NFC-Mode:" + def get_relevant_bolt_changes(dir: str) -> str: # Return a list of bolt source changes that are relevant to testing. all_changes = subprocess.run( @@ -42,14 +44,32 @@ def get_git_ref_or_rev(dir: str) -> str: cmd_rev = "git rev-parse --short HEAD" return subprocess.check_output(shlex.split(cmd_rev), cwd=dir, text=True).strip() +def switch_back( + switch_back: bool, stash: bool, source_dir: str, old_ref: str, new_ref: str +): + # Switch back to the current revision if needed and inform the user of where + # the HEAD is. Must be called after checking out the previous commit on all + # exit paths. + if switch_back: + print(f"{msg_prefix} Switching back to current revision..") + if stash: + subprocess.run(shlex.split("git stash pop"), cwd=source_dir) + subprocess.run(shlex.split(f"git checkout {old_ref}"), cwd=source_dir) + else: + print( + f"The repository {source_dir} has been switched from {old_ref} " + f"to {new_ref}. Local changes were stashed. Switch back using\n\t" + f"git checkout {old_ref}\n" + ) def main(): parser = argparse.ArgumentParser( description=textwrap.dedent( """ - This script builds two versions of BOLT (with the current and - previous revision) and sets up symlink for llvm-bolt-wrapper. - Passes the options through to llvm-bolt-wrapper. + This script builds two versions of BOLT: + llvm-bolt.new, using the current revision, and llvm-bolt.old using + the previous revision. These can be used to check whether the + current revision changes BOLT's functional behavior. """ ) ) @@ -59,6 +79,12 @@ def main(): default=os.getcwd(), help="Path to BOLT build directory, default is current " "directory", ) + parser.add_argument( + "--create-wrapper", + default=False, + action="store_true", + help="Sets up llvm-bolt as a symlink to llvm-bolt-wrapper. 
Passes the options through to llvm-bolt-wrapper.", + ) parser.add_argument( "--check-bolt-sources", default=False, @@ -76,28 +102,42 @@ def main(): default="HEAD^", help="Revision to checkout to compare vs HEAD", ) + + # When creating a wrapper, pass any unknown arguments to it. Otherwise, die. args, wrapper_args = parser.parse_known_args() - bolt_path = f"{args.build_dir}/bin/llvm-bolt" + if not args.create_wrapper and len(wrapper_args) > 0: + parser.parse_args() + # Find the repo directory. source_dir = None - # find the repo directory - with open(f"{args.build_dir}/CMakeCache.txt") as f: - for line in f: - m = re.match(r"LLVM_SOURCE_DIR:STATIC=(.*)", line) - if m: - source_dir = m.groups()[0] - if not source_dir: - sys.exit("Source directory is not found") - - script_dir = os.path.dirname(os.path.abspath(__file__)) - wrapper_path = f"{script_dir}/llvm-bolt-wrapper.py" - # build the current commit + try: + CMCacheFilename = f"{args.build_dir}/CMakeCache.txt" + with open(CMCacheFilename) as f: + for line in f: + m = re.match(r"LLVM_SOURCE_DIR:STATIC=(.*)", line) + if m: + source_dir = m.groups()[0] + if not source_dir: + raise Exception(f"Source directory not found: '{CMCacheFilename}'") + except Exception as e: + sys.exit(e) + + # Clean the previous llvm-bolt if it exists. + bolt_path = f"{args.build_dir}/bin/llvm-bolt" + if os.path.exists(bolt_path): + os.remove(bolt_path) + + # Build the current commit. + print(f"{msg_prefix} Building current revision..") subprocess.run( shlex.split("cmake --build . --target llvm-bolt"), cwd=args.build_dir ) - # rename llvm-bolt + + if not os.path.exists(bolt_path): + sys.exit(f"Failed to build the current revision: '{bolt_path}'") + + # Rename llvm-bolt and memorize the old hash for logging. os.replace(bolt_path, f"{bolt_path}.new") - # memorize the old hash for logging old_ref = get_git_ref_or_rev(source_dir) if args.check_bolt_sources: @@ -110,7 +150,7 @@ def main(): print(f"BOLT source changes were found:\n{file_changes}") open(marker, "a").close() - # determine whether a stash is needed + # Determine whether a stash is needed. stash = subprocess.run( shlex.split("git status --porcelain"), cwd=source_dir, @@ -119,42 +159,59 @@ def main(): text=True, ).stdout if stash: - # save local changes before checkout + # Save local changes before checkout. subprocess.run(shlex.split("git stash push -u"), cwd=source_dir) - # check out the previous/cmp commit + + # Check out the previous/cmp commit and get its commit hash for logging. subprocess.run(shlex.split(f"git checkout -f {args.cmp_rev}"), cwd=source_dir) - # get the parent commit hash for logging new_ref = get_git_ref_or_rev(source_dir) - # build the previous commit + + # Build the previous commit. + print(f"{msg_prefix} Building previous revision..") subprocess.run( shlex.split("cmake --build . --target llvm-bolt"), cwd=args.build_dir ) - # rename llvm-bolt + + # Rename llvm-bolt. 
+ if not os.path.exists(bolt_path): + print(f"Failed to build the previous revision: '{bolt_path}'") + switch_back(args.switch_back, stash, source_dir, old_ref, new_ref) + sys.exit(1) os.replace(bolt_path, f"{bolt_path}.old") - # set up llvm-bolt-wrapper.ini - ini = subprocess.check_output( - shlex.split(f"{wrapper_path} {bolt_path}.old {bolt_path}.new") + wrapper_args, - text=True, + + # Symlink llvm-bolt-wrapper + if args.create_wrapper: + print(f"{msg_prefix} Creating llvm-bolt wrapper..") + script_dir = os.path.dirname(os.path.abspath(__file__)) + wrapper_path = f"{script_dir}/llvm-bolt-wrapper.py" + try: + # Set up llvm-bolt-wrapper.ini + ini = subprocess.check_output( + shlex.split(f"{wrapper_path} {bolt_path}.old {bolt_path}.new") + + wrapper_args, + text=True, + ) + with open(f"{args.build_dir}/bin/llvm-bolt-wrapper.ini", "w") as f: + f.write(ini) + os.symlink(wrapper_path, bolt_path) + except Exception as e: + print("Failed to create a wrapper:\n" + str(e)) + switch_back(args.switch_back, stash, source_dir, old_ref, new_ref) + sys.exit(1) + + switch_back(args.switch_back, stash, source_dir, old_ref, new_ref) + + print( + f"{msg_prefix} Completed!\nBuild directory {args.build_dir} is ready for" + " NFC-Mode comparison between the two revisions." ) - with open(f"{args.build_dir}/bin/llvm-bolt-wrapper.ini", "w") as f: - f.write(ini) - # symlink llvm-bolt-wrapper - os.symlink(wrapper_path, bolt_path) - if args.switch_back: - if stash: - subprocess.run(shlex.split("git stash pop"), cwd=source_dir) - subprocess.run(shlex.split(f"git checkout {old_ref}"), cwd=source_dir) - else: + + if args.create_wrapper: print( - f"The repository {source_dir} has been switched from {old_ref} " - f"to {new_ref}. Local changes were stashed. Switch back using\n\t" - f"git checkout {old_ref}\n" + "Can run BOLT tests using:\n" + "\tbin/llvm-lit -sv tools/bolt/test\nor\n" + "\tbin/llvm-lit -sv tools/bolttests" ) - print( - f"Build directory {args.build_dir} is ready to run BOLT tests, e.g.\n" - "\tbin/llvm-lit -sv tools/bolt/test\nor\n" - "\tbin/llvm-lit -sv tools/bolttests" - ) if __name__ == "__main__": diff --git a/clang-tools-extra/README.txt b/clang-tools-extra/README.txt index 6891e4078997f..1195db9b468dd 100644 --- a/clang-tools-extra/README.txt +++ b/clang-tools-extra/README.txt @@ -8,12 +8,13 @@ Clang frontend. These tools are kept in a separate "extra" repository to allow lighter weight checkouts of the core Clang codebase. 
All discussion regarding Clang, Clang-based tools, and code in this repository -should be held using the standard Clang forum: +should be held using the standard Clang forums: https://discourse.llvm.org/c/clang + https://discourse.llvm.org/c/clang/clang-tidy/71 + https://discourse.llvm.org/c/clang/clangd/34 -Code review for this tree should take place on the standard Clang patch and -commit lists: - http://lists.llvm.org/mailman/listinfo/cfe-commits +Code review for this tree should take place on Github: + https://github.com/llvm/llvm-project/pulls?q=label%3Aclang-tools-extra If you find a bug in these tools, please file it in the LLVM bug tracker: https://github.com/llvm/llvm-project/issues/ diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp index 6fdc7196e9095..cc4c68346ec53 100644 --- a/clang-tools-extra/clang-doc/JSONGenerator.cpp +++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp @@ -45,7 +45,7 @@ static auto SerializeReferenceLambda = [](const auto &Ref, Object &Object) { static json::Object serializeLocation(const Location &Loc, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { Object LocationObj = Object(); LocationObj["LineNumber"] = Loc.StartLineNumber; LocationObj["Filename"] = Loc.Filename; @@ -169,7 +169,7 @@ static json::Value serializeComment(const CommentInfo &I) { static void serializeCommonAttributes(const Info &I, json::Object &Obj, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { Obj["Name"] = I.Name; Obj["USR"] = toHex(toStringRef(I.USR)); @@ -211,9 +211,9 @@ static void serializeReference(const Reference &Ref, Object &ReferenceObj) { // differently. Only enums, records, and typedefs are handled here. static void serializeCommonChildren(const ScopeChildren &Children, json::Object &Obj, - const std::optional &RepositoryUrl) { - static auto SerializeInfo = [&RepositoryUrl](const auto &Info, - Object &Object) { + const std::optional RepositoryUrl) { + static auto SerializeInfo = [RepositoryUrl](const auto &Info, + Object &Object) { serializeInfo(Info, Object, RepositoryUrl); }; @@ -304,7 +304,7 @@ static void serializeInfo(const FieldTypeInfo &I, Object &Obj) { } static void serializeInfo(const FunctionInfo &F, json::Object &Obj, - const std::optional &RepositoryURL) { + const std::optional RepositoryURL) { serializeCommonAttributes(F, Obj, RepositoryURL); Obj["IsStatic"] = F.IsStatic; @@ -459,7 +459,7 @@ static void serializeInfo(const RecordInfo &I, json::Object &Obj, } static void serializeInfo(const VarInfo &I, json::Object &Obj, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { serializeCommonAttributes(I, Obj, RepositoryUrl); Obj["IsStatic"] = I.IsStatic; auto TypeObj = Object(); @@ -468,15 +468,15 @@ static void serializeInfo(const VarInfo &I, json::Object &Obj, } static void serializeInfo(const NamespaceInfo &I, json::Object &Obj, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { serializeCommonAttributes(I, Obj, RepositoryUrl); if (!I.Children.Namespaces.empty()) serializeArray(I.Children.Namespaces, Obj, "Namespaces", SerializeReferenceLambda); - static auto SerializeInfo = [&RepositoryUrl](const auto &Info, - Object &Object) { + static auto SerializeInfo = [RepositoryUrl](const auto &Info, + Object &Object) { serializeInfo(Info, Object, RepositoryUrl); }; diff --git a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp 
index 3c3024d538785..4b495e3877000 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.cpp @@ -49,7 +49,7 @@ static Matcher loopEndingStmt(Matcher Internal) { } /// Return whether `Var` was changed in `LoopStmt`. -static bool isChanged(const Stmt *LoopStmt, const VarDecl *Var, +static bool isChanged(const Stmt *LoopStmt, const ValueDecl *Var, ASTContext *Context) { if (const auto *ForLoop = dyn_cast(LoopStmt)) return (ForLoop->getInc() && @@ -64,24 +64,35 @@ static bool isChanged(const Stmt *LoopStmt, const VarDecl *Var, return ExprMutationAnalyzer(*LoopStmt, *Context).isMutated(Var); } +static bool isVarPossiblyChanged(const Decl *Func, const Stmt *LoopStmt, + const ValueDecl *VD, ASTContext *Context) { + const VarDecl *Var = nullptr; + if (const auto *VarD = dyn_cast(VD)) { + Var = VarD; + } else if (const auto *BD = dyn_cast(VD)) { + if (const auto *DD = dyn_cast(BD->getDecomposedDecl())) + Var = DD; + } + + if (!Var) + return false; + + if (!Var->isLocalVarDeclOrParm() || Var->getType().isVolatileQualified()) + return true; + + if (!VD->getType().getTypePtr()->isIntegerType()) + return true; + + return hasPtrOrReferenceInFunc(Func, VD) || isChanged(LoopStmt, VD, Context); + // FIXME: Track references. +} + /// Return whether `Cond` is a variable that is possibly changed in `LoopStmt`. static bool isVarThatIsPossiblyChanged(const Decl *Func, const Stmt *LoopStmt, const Stmt *Cond, ASTContext *Context) { if (const auto *DRE = dyn_cast(Cond)) { - if (const auto *Var = dyn_cast(DRE->getDecl())) { - if (!Var->isLocalVarDeclOrParm()) - return true; - - if (Var->getType().isVolatileQualified()) - return true; - - if (!Var->getType().getTypePtr()->isIntegerType()) - return true; - - return hasPtrOrReferenceInFunc(Func, Var) || - isChanged(LoopStmt, Var, Context); - // FIXME: Track references. - } + if (const auto *VD = dyn_cast(DRE->getDecl())) + return isVarPossiblyChanged(Func, LoopStmt, VD, Context); } else if (isa(Cond)) { // FIXME: Handle MemberExpr. @@ -123,6 +134,10 @@ static std::string getCondVarNames(const Stmt *Cond) { if (const auto *DRE = dyn_cast(Cond)) { if (const auto *Var = dyn_cast(DRE->getDecl())) return std::string(Var->getName()); + + if (const auto *BD = dyn_cast(DRE->getDecl())) { + return std::string(BD->getName()); + } } std::string Result; @@ -214,10 +229,17 @@ static bool overlap(ArrayRef SCC, /// returns true iff `Cond` involves at least one static local variable. 
static bool hasStaticLocalVariable(const Stmt *Cond) { - if (const auto *DRE = dyn_cast(Cond)) + if (const auto *DRE = dyn_cast(Cond)) { if (const auto *VD = dyn_cast(DRE->getDecl())) if (VD->isStaticLocal()) return true; + + if (const auto *BD = dyn_cast(DRE->getDecl())) + if (const auto *DD = dyn_cast(BD->getDecomposedDecl())) + if (DD->isStaticLocal()) + return true; + } + for (const Stmt *Child : Cond->children()) if (Child && hasStaticLocalVariable(Child)) return true; diff --git a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp index 33642c407a3a9..bfa2ab51a6d03 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp @@ -45,7 +45,7 @@ static void replaceMoveWithForward(const UnresolvedLookupExpr *Callee, // We still conservatively put a "std::" in front of the forward because // we don't know whether the code also had a "using std::forward;". Diag << FixItHint::CreateReplacement(CallRange, "std::" + ForwardName); - } else if (const NamespaceDecl *Namespace = NNS->getAsNamespace()) { + } else if (const NamespaceBaseDecl *Namespace = NNS->getAsNamespace()) { if (Namespace->getName() == "std") { if (!NNS->getPrefix()) { // Called as "std::move". diff --git a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp index 2dfaca19a8981..86992cd8a141b 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp @@ -36,7 +36,8 @@ void UnusedAliasDeclsCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *NestedName = Result.Nodes.getNodeAs("nns")) { - if (const auto *AliasDecl = NestedName->getAsNamespaceAlias()) { + if (const auto *AliasDecl = dyn_cast_if_present( + NestedName->getAsNamespace())) { FoundDecls[AliasDecl] = CharSourceRange(); } } diff --git a/clang-tools-extra/clang-tidy/utils/Aliasing.cpp b/clang-tools-extra/clang-tidy/utils/Aliasing.cpp index 2facf0625605e..cbe4873b5c022 100644 --- a/clang-tools-extra/clang-tidy/utils/Aliasing.cpp +++ b/clang-tools-extra/clang-tidy/utils/Aliasing.cpp @@ -14,14 +14,14 @@ namespace clang::tidy::utils { /// Return whether \p S is a reference to the declaration of \p Var. -static bool isAccessForVar(const Stmt *S, const VarDecl *Var) { +static bool isAccessForVar(const Stmt *S, const ValueDecl *Var) { if (const auto *DRE = dyn_cast(S)) return DRE->getDecl() == Var; return false; } -static bool capturesByRef(const CXXRecordDecl *RD, const VarDecl *Var) { +static bool capturesByRef(const CXXRecordDecl *RD, const ValueDecl *Var) { return llvm::any_of(RD->captures(), [Var](const LambdaCapture &C) { return C.capturesVariable() && C.getCaptureKind() == LCK_ByRef && C.getCapturedVar() == Var; @@ -29,9 +29,9 @@ static bool capturesByRef(const CXXRecordDecl *RD, const VarDecl *Var) { } /// Return whether \p Var has a pointer or reference in \p S. -static bool isPtrOrReferenceForVar(const Stmt *S, const VarDecl *Var) { +static bool isPtrOrReferenceForVar(const Stmt *S, const ValueDecl *Var) { // Treat block capture by reference as a form of taking a reference. 
- if (Var->isEscapingByref()) + if (const auto *VD = dyn_cast(Var); VD && VD->isEscapingByref()) return true; if (const auto *DS = dyn_cast(S)) { @@ -61,7 +61,7 @@ static bool isPtrOrReferenceForVar(const Stmt *S, const VarDecl *Var) { } /// Return whether \p Var has a pointer or reference in \p S. -static bool hasPtrOrReferenceInStmt(const Stmt *S, const VarDecl *Var) { +static bool hasPtrOrReferenceInStmt(const Stmt *S, const ValueDecl *Var) { if (isPtrOrReferenceForVar(S, Var)) return true; @@ -77,7 +77,7 @@ static bool hasPtrOrReferenceInStmt(const Stmt *S, const VarDecl *Var) { } static bool refersToEnclosingLambdaCaptureByRef(const Decl *Func, - const VarDecl *Var) { + const ValueDecl *Var) { const auto *MD = dyn_cast(Func); if (!MD) return false; @@ -89,7 +89,7 @@ static bool refersToEnclosingLambdaCaptureByRef(const Decl *Func, return capturesByRef(RD, Var); } -bool hasPtrOrReferenceInFunc(const Decl *Func, const VarDecl *Var) { +bool hasPtrOrReferenceInFunc(const Decl *Func, const ValueDecl *Var) { return hasPtrOrReferenceInStmt(Func->getBody(), Var) || refersToEnclosingLambdaCaptureByRef(Func, Var); } diff --git a/clang-tools-extra/clang-tidy/utils/Aliasing.h b/clang-tools-extra/clang-tidy/utils/Aliasing.h index 7dad16fc57f1e..6c0763b766805 100644 --- a/clang-tools-extra/clang-tidy/utils/Aliasing.h +++ b/clang-tools-extra/clang-tidy/utils/Aliasing.h @@ -25,7 +25,7 @@ namespace clang::tidy::utils { /// For `f()` and `n` the function returns ``true`` because `p` is a /// pointer to `n` created in `f()`. -bool hasPtrOrReferenceInFunc(const Decl *Func, const VarDecl *Var); +bool hasPtrOrReferenceInFunc(const Decl *Func, const ValueDecl *Var); } // namespace clang::tidy::utils diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index 6cf38ddf3d914..dd28806e008ed 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -282,7 +282,8 @@ class RenamerClangTidyVisitor bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc Loc) { if (const NestedNameSpecifier *Spec = Loc.getNestedNameSpecifier()) { - if (const NamespaceDecl *Decl = Spec->getAsNamespace()) + if (const auto *Decl = + dyn_cast_if_present(Spec->getAsNamespace())) Check->addUsage(Decl, Loc.getLocalSourceRange(), SM); } diff --git a/clang-tools-extra/clangd/AST.cpp b/clang-tools-extra/clangd/AST.cpp index e274236527817..f2631e5abb6a3 100644 --- a/clang-tools-extra/clangd/AST.cpp +++ b/clang-tools-extra/clangd/AST.cpp @@ -666,12 +666,14 @@ std::string getQualification(ASTContext &Context, return getQualification( Context, DestContext, ND->getDeclContext(), [&](NestedNameSpecifier *NNS) { - if (NNS->getKind() != NestedNameSpecifier::Namespace) + const NamespaceDecl *NS = + dyn_cast_if_present(NNS->getAsNamespace()); + if (!NS) return false; - const auto *CanonNSD = NNS->getAsNamespace()->getCanonicalDecl(); + NS = NS->getCanonicalDecl(); return llvm::any_of(VisibleNamespaceDecls, - [CanonNSD](const NamespaceDecl *NSD) { - return NSD->getCanonicalDecl() == CanonNSD; + [NS](const NamespaceDecl *NSD) { + return NSD->getCanonicalDecl() == NS; }); }); } diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index d5907e3143bf6..184c3c962f063 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1470,7 +1470,6 @@ bool allowIndex(CodeCompletionContext &CC) { switch 
(NameSpec->getKind()) { case NestedNameSpecifier::Global: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: return true; case NestedNameSpecifier::Super: case NestedNameSpecifier::TypeSpec: diff --git a/clang-tools-extra/clangd/DumpAST.cpp b/clang-tools-extra/clangd/DumpAST.cpp index 8f24477ecd3de..c6075e75e9a6b 100644 --- a/clang-tools-extra/clangd/DumpAST.cpp +++ b/clang-tools-extra/clangd/DumpAST.cpp @@ -158,7 +158,6 @@ class DumpVisitor : public RecursiveASTVisitor { NNS_KIND(TypeSpec); NNS_KIND(Global); NNS_KIND(Super); - NNS_KIND(NamespaceAlias); #undef NNS_KIND } llvm_unreachable("Unhandled SpecifierKind enum"); @@ -281,8 +280,6 @@ class DumpVisitor : public RecursiveASTVisitor { return NNS.getAsIdentifier()->getName().str() + "::"; case NestedNameSpecifier::Namespace: return NNS.getAsNamespace()->getNameAsString() + "::"; - case NestedNameSpecifier::NamespaceAlias: - return NNS.getAsNamespaceAlias()->getNameAsString() + "::"; default: return ""; } diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index 91fd3b0f8567b..b1089577ba819 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -491,9 +491,6 @@ struct TargetFinder { case NestedNameSpecifier::Namespace: add(NNS->getAsNamespace(), Flags); return; - case NestedNameSpecifier::NamespaceAlias: - add(NNS->getAsNamespaceAlias(), Flags); - return; case NestedNameSpecifier::Identifier: if (Resolver) { add(Resolver->resolveNestedNameSpecifierToType(NNS), Flags); diff --git a/clang-tools-extra/clangd/IncludeFixer.cpp b/clang-tools-extra/clangd/IncludeFixer.cpp index 4ff021c4c390a..50bc2bd7ccb94 100644 --- a/clang-tools-extra/clangd/IncludeFixer.cpp +++ b/clang-tools-extra/clangd/IncludeFixer.cpp @@ -403,25 +403,27 @@ std::optional extractUnresolvedNameCheaply( if (auto *Nested = SS->getScopeRep()) { if (Nested->getKind() == NestedNameSpecifier::Global) { Result.ResolvedScope = ""; - } else if (const auto *NS = Nested->getAsNamespace()) { - std::string SpecifiedNS = printNamespaceScope(*NS); - std::optional Spelling = getSpelledSpecifier(*SS, SM); - - // Check the specifier spelled in the source. - // If the resolved scope doesn't end with the spelled scope, the - // resolved scope may come from a sema typo correction. For example, - // sema assumes that "clangd::" is a typo of "clang::" and uses - // "clang::" as the specified scope in: - // namespace clang { clangd::X; } - // In this case, we use the "typo" specifier as extra scope instead - // of using the scope assumed by sema. - if (!Spelling || llvm::StringRef(SpecifiedNS).ends_with(*Spelling)) { - Result.ResolvedScope = std::move(SpecifiedNS); + } else if (const NamespaceBaseDecl *NSB = Nested->getAsNamespace()) { + if (const auto *NS = dyn_cast(NSB)) { + std::string SpecifiedNS = printNamespaceScope(*NS); + std::optional Spelling = getSpelledSpecifier(*SS, SM); + + // Check the specifier spelled in the source. + // If the resolved scope doesn't end with the spelled scope, the + // resolved scope may come from a sema typo correction. For example, + // sema assumes that "clangd::" is a typo of "clang::" and uses + // "clang::" as the specified scope in: + // namespace clang { clangd::X; } + // In this case, we use the "typo" specifier as extra scope instead + // of using the scope assumed by sema. 
+ if (!Spelling || llvm::StringRef(SpecifiedNS).ends_with(*Spelling)) { + Result.ResolvedScope = std::move(SpecifiedNS); + } else { + Result.UnresolvedScope = std::move(*Spelling); + } } else { - Result.UnresolvedScope = std::move(*Spelling); + Result.ResolvedScope = printNamespaceScope(*cast(NSB)->getNamespace()); } - } else if (const auto *ANS = Nested->getAsNamespaceAlias()) { - Result.ResolvedScope = printNamespaceScope(*ANS->getNamespace()); } else { // We don't fix symbols in scopes that are not top-level e.g. class // members, as we don't collect includes for them. diff --git a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp index 00c05ebdb5216..67fc451a6a1a1 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp @@ -173,7 +173,8 @@ findInsertionPoint(const Tweak::Selection &Inputs, if (SM.isBeforeInTranslationUnit(Inputs.Cursor, U->getUsingLoc())) // "Usings" is sorted, so we're done. break; - if (const auto *Namespace = U->getQualifier()->getAsNamespace()) { + if (const auto *Namespace = dyn_cast_if_present( + U->getQualifier()->getAsNamespace())) { if (Namespace->getCanonicalDecl() == QualifierToRemove.getNestedNameSpecifier() ->getAsNamespace() @@ -232,7 +233,10 @@ findInsertionPoint(const Tweak::Selection &Inputs, bool isNamespaceForbidden(const Tweak::Selection &Inputs, const NestedNameSpecifier &Namespace) { - std::string NamespaceStr = printNamespaceScope(*Namespace.getAsNamespace()); + const auto *NS = dyn_cast(Namespace.getAsNamespace()); + if (!NS) + return true; + std::string NamespaceStr = printNamespaceScope(*NS); for (StringRef Banned : Config::current().Style.FullyQualifiedNamespaces) { StringRef PrefixMatch = NamespaceStr; diff --git a/clang-tools-extra/clangd/refactor/tweaks/CMakeLists.txt b/clang-tools-extra/clangd/refactor/tweaks/CMakeLists.txt index 59475b0dfd3d2..1d6e38088ad67 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/CMakeLists.txt +++ b/clang-tools-extra/clangd/refactor/tweaks/CMakeLists.txt @@ -14,9 +14,9 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangDaemonTweaks OBJECT AddUsing.cpp AnnotateHighlightings.cpp - DumpAST.cpp DefineInline.cpp DefineOutline.cpp + DumpAST.cpp ExpandDeducedType.cpp ExpandMacro.cpp ExtractFunction.cpp @@ -24,6 +24,7 @@ add_clang_library(clangDaemonTweaks OBJECT MemberwiseConstructor.cpp ObjCLocalizeStringLiteral.cpp ObjCMemberwiseInitializer.cpp + OverridePureVirtuals.cpp PopulateSwitch.cpp RawStringLiteral.cpp RemoveUsingNamespace.cpp diff --git a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp new file mode 100644 index 0000000000000..16febeca70809 --- /dev/null +++ b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp @@ -0,0 +1,374 @@ +//===--- OverridePureVirtuals.cpp --------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Tweak to automatically generate stubs for pure virtual methods inherited from +// base classes. +// +// Purpose: +// - Simplifies making a derived class concrete by automating the creation of +// required method overrides from abstract bases. +// +// Tweak Summary: +// +// 1. 
Activation Conditions (prepare): +// - The tweak activates when the cursor is over a C++ class definition. +// - The class must be abstract (it, or its base classes, have unimplemented +// pure virtual functions). +// - It must also inherit from at least one other abstract class. +// +// 2. Identifying Missing Methods: +// - The tweak scans the inheritance hierarchy of the current class. +// - It identifies all unique pure virtual methods from base classes +// that are not yet implemented or overridden. +// - These missing methods are then grouped by their original access +// specifier (e.g., public, protected). +// +// 3. Code Generation and Insertion: +// - For each group of missing methods, stubs are inserted. +// - If an access specifier section (like `public:`) exists, stubs are +// inserted there; otherwise, a new section is created and appended. +// - Each generated stub includes the `override` keyword, a `// TODO:` +// comment, and a `static_assert(false, ...)` to force a compile-time +// error if the method remains unimplemented. +// - The base method's signature is adjusted (e.g., `virtual` and `= 0` +// are removed for the override). +// +// 4. Code Action Provided: +// - A single code action titled "Override pure virtual methods" is offered. +// - Applying this action results in a single source file modification +// containing all the generated method stubs. +// +// Example: +// +// class Base { +// public: +// virtual void publicMethod() = 0; +// protected: +// virtual auto privateMethod() const -> int = 0; +// }; +// +// Before: +// // cursor here +// class Derived : public Base {}^; +// +// After: +// +// class Derived : public Base { +// public: +// void publicMethod() override { +// // TODO: Implement this pure virtual method. +// static_assert(false, "Method `publicMethod` is not implemented."); +// } +// +// protected: +// auto privateMethod() const -> int override { +// // TODO: Implement this pure virtual method. +// static_assert(false, "Method `privateMethod` is not implemented."); +// } +// }; +// +//===----------------------------------------------------------------------===// + +#include "refactor/Tweak.h" +#include "support/Token.h" + +#include "clang/AST/ASTContext.h" +#include "clang/AST/DeclCXX.h" +#include "clang/AST/Type.h" +#include "clang/AST/TypeLoc.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Tooling/Core/Replacement.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/Support/FormatVariadic.h" +#include + +namespace clang { +namespace clangd { +namespace { + +// This function removes the "virtual" and the "= 0" at the end; +// e.g.: +// "virtual void foo(int var = 0) = 0" // input. +// "void foo(int var = 0)" // output. +std::string removePureVirtualSyntax(const std::string &MethodDecl, + const LangOptions &LangOpts) { + assert(!MethodDecl.empty()); + + TokenStream TS = lex(MethodDecl, LangOpts); + + std::string DeclString; + for (const clangd::Token &Tk : TS.tokens()) { + if (Tk.Kind == clang::tok::raw_identifier && Tk.text() == "virtual") + continue; + + // If the ending two tokens are "= 0", we break here and we already have the + // method's string without the pure virtual syntax. 
+ const auto &Next = Tk.next(); + if (Next.next().Kind == tok::eof && Tk.Kind == clang::tok::equal && + Next.text() == "0") + break; + + DeclString += Tk.text(); + if (Tk.Kind != tok::l_paren && Next.Kind != tok::comma && + Next.Kind != tok::r_paren && Next.Kind != tok::l_paren) + DeclString += ' '; + } + // Trim the last whitespace. + if (DeclString.back() == ' ') + DeclString.pop_back(); + + return DeclString; +} + +class OverridePureVirtuals final : public Tweak { +public: + const char *id() const final; // defined by REGISTER_TWEAK. + bool prepare(const Selection &Sel) override; + Expected apply(const Selection &Sel) override; + std::string title() const override { return "Override pure virtual methods"; } + llvm::StringLiteral kind() const override { + return CodeAction::QUICKFIX_KIND; + } + +private: + // Stores the CXXRecordDecl of the class being modified. + const CXXRecordDecl *CurrentDeclDef = nullptr; + // Stores pure virtual methods that need overriding, grouped by their original + // access specifier. + llvm::MapVector> + MissingMethodsByAccess; + // Stores the source locations of existing access specifiers in CurrentDecl. + llvm::MapVector AccessSpecifierLocations; + // Helper function to gather information before applying the tweak. + void collectMissingPureVirtuals(); +}; + +REGISTER_TWEAK(OverridePureVirtuals) + +// Function to get all unique pure virtual methods from the entire +// base class hierarchy of CurrentDeclDef. +llvm::SmallVector +getAllUniquePureVirtualsFromBaseHierarchy( + const clang::CXXRecordDecl *CurrentDeclDef) { + llvm::SmallVector AllPureVirtualsInHierarchy; + llvm::DenseSet CanonicalPureVirtualsSeen; + + if (!CurrentDeclDef || !CurrentDeclDef->getDefinition()) + return AllPureVirtualsInHierarchy; + + const clang::CXXRecordDecl *Def = CurrentDeclDef->getDefinition(); + + Def->forallBases([&](const clang::CXXRecordDecl *BaseDefinition) { + for (const clang::CXXMethodDecl *Method : BaseDefinition->methods()) { + if (Method->isPureVirtual() && + CanonicalPureVirtualsSeen.insert(Method->getCanonicalDecl()).second) + AllPureVirtualsInHierarchy.emplace_back(Method); + } + // Continue iterating through all bases. + return true; + }); + + return AllPureVirtualsInHierarchy; +} + +// Gets canonical declarations of methods already overridden or implemented in +// class D. +llvm::SetVector +getImplementedOrOverriddenCanonicals(const CXXRecordDecl *D) { + llvm::SetVector ImplementedSet; + for (const CXXMethodDecl *M : D->methods()) { + // If M provides an implementation for any virtual method it overrides. + // A method is an "implementation" if it's virtual and not pure. + // Or if it directly overrides a base method. + for (const CXXMethodDecl *OverriddenM : M->overridden_methods()) + ImplementedSet.insert(OverriddenM->getCanonicalDecl()); + } + return ImplementedSet; +} + +// Get the location of every colon of the `AccessSpecifier`. +llvm::MapVector +getSpecifierLocations(const CXXRecordDecl *D) { + llvm::MapVector Locs; + for (auto *DeclNode : D->decls()) { + if (const auto *ASD = llvm::dyn_cast(DeclNode)) + Locs[ASD->getAccess()] = ASD->getColonLoc(); + } + return Locs; +} + +bool hasAbstractBaseAncestor(const clang::CXXRecordDecl *CurrentDecl) { + assert(CurrentDecl && CurrentDecl->getDefinition()); + + return llvm::any_of( + CurrentDecl->getDefinition()->bases(), [](CXXBaseSpecifier BaseSpec) { + const auto *D = BaseSpec.getType()->getAsCXXRecordDecl(); + const auto *Def = D ? 
D->getDefinition() : nullptr;
+        return Def && Def->isAbstract();
+      });
+}
+
+// The tweak is available if the selection is over an abstract C++ class
+// definition that also inherits from at least one other abstract class.
+bool OverridePureVirtuals::prepare(const Selection &Sel) {
+  const SelectionTree::Node *Node = Sel.ASTSelection.commonAncestor();
+  if (!Node)
+    return false;
+
+  // Make sure we have a definition.
+  CurrentDeclDef = Node->ASTNode.get<CXXRecordDecl>();
+  if (!CurrentDeclDef || !CurrentDeclDef->getDefinition())
+    return false;
+
+  // From now on, we should work with the definition.
+  CurrentDeclDef = CurrentDeclDef->getDefinition();
+
+  // Only offer for abstract classes with abstract bases.
+  return CurrentDeclDef->isAbstract() &&
+         hasAbstractBaseAncestor(CurrentDeclDef);
+}
+
+// Collects all pure virtual methods from base classes that `CurrentDeclDef` has
+// not yet overridden, grouped by their original access specifier.
+//
+// Results are stored in `MissingMethodsByAccess` and `AccessSpecifierLocations`
+// is also populated.
+void OverridePureVirtuals::collectMissingPureVirtuals() {
+  if (!CurrentDeclDef)
+    return;
+
+  AccessSpecifierLocations = getSpecifierLocations(CurrentDeclDef);
+  MissingMethodsByAccess.clear();
+
+  // Get all unique pure virtual methods from the entire base class hierarchy.
+  llvm::SmallVector<const CXXMethodDecl *> AllPureVirtualsInHierarchy =
+      getAllUniquePureVirtualsFromBaseHierarchy(CurrentDeclDef);
+
+  // Get methods already implemented or overridden in CurrentDecl.
+  const auto ImplementedOrOverriddenSet =
+      getImplementedOrOverriddenCanonicals(CurrentDeclDef);
+
+  // Filter AllPureVirtualsInHierarchy to find those not in
+  // ImplementedOrOverriddenSet, which need to be overridden.
+  for (const CXXMethodDecl *BaseMethod : AllPureVirtualsInHierarchy) {
+    bool AlreadyHandled = ImplementedOrOverriddenSet.contains(BaseMethod);
+    if (!AlreadyHandled)
+      MissingMethodsByAccess[BaseMethod->getAccess()].emplace_back(BaseMethod);
+  }
+}
+
+std::string generateOverrideString(const CXXMethodDecl *Method,
+                                   const LangOptions &LangOpts) {
+  std::string MethodDecl;
+  auto OS = llvm::raw_string_ostream(MethodDecl);
+  Method->print(OS);
+
+  return llvm::formatv(
+             "\n  {0} override {{\n"
+             "    // TODO: Implement this pure virtual method.\n"
+             "    static_assert(false, \"Method `{1}` is not implemented.\");\n"
+             "  }",
+             removePureVirtualSyntax(MethodDecl, LangOpts), Method->getName())
+      .str();
+}
+
+// Free function to generate the string for a group of method overrides.
+std::string generateOverridesStringForGroup(
+    llvm::SmallVector<const CXXMethodDecl *> Methods,
+    const LangOptions &LangOpts) {
+  llvm::SmallVector<std::string> MethodsString;
+  MethodsString.reserve(Methods.size());
+
+  for (const CXXMethodDecl *Method : Methods) {
+    MethodsString.emplace_back(generateOverrideString(Method, LangOpts));
+  }
+
+  return llvm::join(MethodsString, "\n") + '\n';
+}
+
+Expected<Tweak::Effect> OverridePureVirtuals::apply(const Selection &Sel) {
+  // The correctness of this tweak heavily relies on the accurate population of
+  // these members.
+  collectMissingPureVirtuals();
+  // The `prepare` should prevent this. If `prepare` identifies an abstract
+  // class, then it must have missing methods.
+  assert(!MissingMethodsByAccess.empty());
+
+  const auto &SM = Sel.AST->getSourceManager();
+  const auto &LangOpts = Sel.AST->getLangOpts();
+
+  tooling::Replacements EditReplacements;
+  // Stores text for new access specifier sections that are not already present
+  // in the class.
+  // Example:
+  //   public: // ...
+  //   protected: // ...
+ std::string NewSectionsToAppendText; + + for (const auto &[AS, Methods] : MissingMethodsByAccess) { + assert(!Methods.empty()); + + std::string MethodsGroupString = + generateOverridesStringForGroup(Methods, LangOpts); + + auto *ExistingSpecLocIter = AccessSpecifierLocations.find(AS); + bool ASExists = ExistingSpecLocIter != AccessSpecifierLocations.end(); + if (ASExists) { + // Access specifier section already exists in the class. + // Get location immediately *after* the colon. + SourceLocation InsertLoc = + ExistingSpecLocIter->second.getLocWithOffset(1); + + // Create a replacement to insert the method declarations. + // The replacement is at InsertLoc, has length 0 (insertion), and uses + // InsertionText. + std::string InsertionText = MethodsGroupString; + tooling::Replacement Rep(SM, InsertLoc, 0, InsertionText); + if (auto Err = EditReplacements.add(Rep)) + return llvm::Expected(std::move(Err)); + } else { + // Access specifier section does not exist in the class. + // These methods will be grouped into NewSectionsToAppendText and added + // towards the end of the class definition. + NewSectionsToAppendText += + getAccessSpelling(AS).str() + ':' + MethodsGroupString; + } + } + + // After processing all access specifiers, add any newly created sections + // (stored in NewSectionsToAppendText) to the end of the class. + if (!NewSectionsToAppendText.empty()) { + // AppendLoc is the SourceLocation of the closing brace '}' of the class. + // The replacement will insert text *before* this closing brace. + SourceLocation AppendLoc = CurrentDeclDef->getBraceRange().getEnd(); + std::string FinalAppendText = std::move(NewSectionsToAppendText); + + if (!CurrentDeclDef->decls_empty() || !EditReplacements.empty()) { + FinalAppendText = '\n' + FinalAppendText; + } + + // Create a replacement to append the new sections. + tooling::Replacement Rep(SM, AppendLoc, 0, FinalAppendText); + if (auto Err = EditReplacements.add(Rep)) + return llvm::Expected(std::move(Err)); + } + + if (EditReplacements.empty()) { + return llvm::make_error( + "No changes to apply (internal error or no methods generated).", + llvm::inconvertibleErrorCode()); + } + + // Return the collected replacements as the effect of this tweak. + return Effect::mainFileEdit(SM, EditReplacements); +} + +} // namespace +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt index dffdcd5d014ca..d425070c7f3b7 100644 --- a/clang-tools-extra/clangd/unittests/CMakeLists.txt +++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -131,6 +131,7 @@ add_unittest(ClangdUnitTests ClangdTests tweaks/MemberwiseConstructorTests.cpp tweaks/ObjCLocalizeStringLiteralTests.cpp tweaks/ObjCMemberwiseInitializerTests.cpp + tweaks/OverridePureVirtualsTests.cpp tweaks/PopulateSwitchTests.cpp tweaks/RawStringLiteralTests.cpp tweaks/RemoveUsingNamespaceTests.cpp diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index 602f61d9ecb41..4d77f9d690ca0 100644 --- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -838,7 +838,7 @@ TEST_F(TargetDeclTest, OverloadExpr) { )cpp"; // Sized deallocation is enabled by default in C++14 onwards. 
EXPECT_DECLS("CXXDeleteExpr", - "void operator delete(void *, unsigned long) noexcept"); + "void operator delete(void *, __size_t) noexcept"); } TEST_F(TargetDeclTest, DependentExprs) { diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 775278ccf694b..4a21dafed5e95 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -2794,7 +2794,7 @@ TEST(Hover, All) { })cpp", [](HoverInfo &HI) { HI.Name = "expression"; - HI.Type = "unsigned long"; + HI.Type = {"__size_t", "unsigned long"}; HI.Value = "1"; }}, { @@ -2804,7 +2804,7 @@ TEST(Hover, All) { })cpp", [](HoverInfo &HI) { HI.Name = "expression"; - HI.Type = "unsigned long"; + HI.Type = {"__size_t", "unsigned long"}; HI.Value = "1"; }}, { diff --git a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp new file mode 100644 index 0000000000000..b7dcbee1650ec --- /dev/null +++ b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp @@ -0,0 +1,720 @@ +//===-- OverridePureVirtualsTests.cpp ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TweakTesting.h" +#include "gtest/gtest.h" + +namespace clang { +namespace clangd { +namespace { + +class OverridePureVirtualsTests : public TweakTest { +protected: + OverridePureVirtualsTests() : TweakTest("OverridePureVirtuals") {} +}; + +TEST_F(OverridePureVirtualsTests, MinimalUnavailable) { + EXPECT_UNAVAILABLE("class ^C {};"); +} + +TEST_F(OverridePureVirtualsTests, MinimalAvailable) { + EXPECT_AVAILABLE(R"cpp( +class B { public: virtual void Foo() = 0; }; +class ^C : public B {}; +)cpp"); +} + +TEST_F(OverridePureVirtualsTests, UnavailableWhenOverriden) { + EXPECT_UNAVAILABLE( + R"cpp( +class B { +public: + virtual void foo() = 0; +}; + +class ^D : public B { +public: + void foo() override; +}; +)cpp"); +} + +TEST_F(OverridePureVirtualsTests, AvailabilityNoOverride) { + EXPECT_AVAILABLE(R"cpp( +class Base { +public: +virtual ~Base() = default; +virtual void F1() = 0; +virtual void F2() = 0; +}; + +class ^Derived : public Base { +public: +}; + +)cpp"); +} + +TEST_F(OverridePureVirtualsTests, AvailabilityPartiallyOverridden) { + EXPECT_AVAILABLE(R"cpp( +class Base { +public: +virtual ~Base() = default; +virtual void F1() = 0; +virtual void F2() = 0; +}; + +class ^Derived : public Base { +public: +void F1() override; +}; +)cpp"); +} + +TEST_F(OverridePureVirtualsTests, EmptyDerivedClass) { + const char *Before = R"cpp( +class Base { +public: +virtual ~Base() = default; +virtual void F1() = 0; +virtual void F2(int P1, const int &P2) = 0; +}; + +class ^Derived : public Base {}; +)cpp"; + const auto *Expected = R"cpp( +class Base { +public: +virtual ~Base() = default; +virtual void F1() = 0; +virtual void F2(int P1, const int &P2) = 0; +}; + +class Derived : public Base { +public: + void F1() override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `F1` is not implemented."); + } + + void F2(int P1, const int & P2) override { + // TODO: Implement this pure virtual method. 
+ static_assert(false, "Method `F2` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, SingleBaseClassPartiallyImplemented) { + auto Applied = apply( + R"cpp( +class Base { +public: +virtual ~Base() = default; +virtual void F1() = 0; +virtual void F2() = 0; +}; + +class ^Derived : public Base { +public: + void F1() override; +}; +)cpp"); + + const auto *Expected = R"cpp( +class Base { +public: +virtual ~Base() = default; +virtual void F1() = 0; +virtual void F2() = 0; +}; + +class Derived : public Base { +public: + void F2() override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `F2` is not implemented."); + } + + void F1() override; +}; +)cpp"; + EXPECT_EQ(Applied, Expected) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, MultipleDirectBaseClasses) { + const char *Before = R"cpp( +class Base1 { +public: + virtual void func1() = 0; +}; +class Base2 { +protected: + virtual bool func2(char c) const = 0; +}; + +class ^Derived : public Base1, public Base2 {}; +)cpp"; + const auto *Expected = R"cpp( +class Base1 { +public: + virtual void func1() = 0; +}; +class Base2 { +protected: + virtual bool func2(char c) const = 0; +}; + +class Derived : public Base1, public Base2 { +public: + void func1() override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `func1` is not implemented."); + } +protected: + bool func2(char c) const override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `func2` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, UnnamedParametersInBase) { + const char *Before = R"cpp( +struct S {}; +class Base { +public: + virtual void func(int, const S&, char*) = 0; +}; + +class ^Derived : public Base {}; +)cpp"; + + const auto *Expected = R"cpp( +struct S {}; +class Base { +public: + virtual void func(int, const S&, char*) = 0; +}; + +class Derived : public Base { +public: + void func(int, const S &, char *) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `func` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, DiamondInheritance) { + const char *Before = R"cpp( +class Top { +public: + virtual ~Top() = default; + virtual void diamond_func() = 0; +}; +class Left : virtual public Top {}; +class Right : virtual public Top {}; +class ^Bottom : public Left, public Right {}; +)cpp"; + const auto *Expected = R"cpp( +class Top { +public: + virtual ~Top() = default; + virtual void diamond_func() = 0; +}; +class Left : virtual public Top {}; +class Right : virtual public Top {}; +class Bottom : public Left, public Right { +public: + void diamond_func() override { + // TODO: Implement this pure virtual method. 
+ static_assert(false, "Method `diamond_func` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, MixedAccessSpecifiers) { + const char *Before = R"cpp( +class Base { +public: + virtual void pub_func() = 0; + virtual void pub_func2(char) const = 0; +protected: + virtual int prot_func(int x) const = 0; +}; + +class ^Derived : public Base { + int member; // Existing member +public: + Derived(int m) : member(m) {} +}; +)cpp"; + const auto *Expected = R"cpp( +class Base { +public: + virtual void pub_func() = 0; + virtual void pub_func2(char) const = 0; +protected: + virtual int prot_func(int x) const = 0; +}; + +class Derived : public Base { + int member; // Existing member +public: + void pub_func() override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `pub_func` is not implemented."); + } + + void pub_func2(char) const override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `pub_func2` is not implemented."); + } + + Derived(int m) : member(m) {} + +protected: + int prot_func(int x) const override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `prot_func` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, OutOfOrderMixedAccessSpecifiers) { + const char *Before = R"cpp( +class Base { +public: + virtual void pub_func() = 0; + virtual void pub_func2(char) const = 0; +protected: + virtual int prot_func(int x) const = 0; +}; + +class ^Derived : public Base { + int member; // Existing member +protected: + void foo(); +public: + Derived(int m) : member(m) {} +}; +)cpp"; + const auto *Expected = R"cpp( +class Base { +public: + virtual void pub_func() = 0; + virtual void pub_func2(char) const = 0; +protected: + virtual int prot_func(int x) const = 0; +}; + +class Derived : public Base { + int member; // Existing member +protected: + int prot_func(int x) const override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `prot_func` is not implemented."); + } + + void foo(); +public: + void pub_func() override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `pub_func` is not implemented."); + } + + void pub_func2(char) const override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `pub_func2` is not implemented."); + } + + Derived(int m) : member(m) {} +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, MultiAccessSpecifiersOverride) { + constexpr auto Before = R"cpp( +class Base { +public: + virtual void foo() = 0; +protected: + virtual void bar() = 0; +}; + +class ^Derived : public Base {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class Base { +public: + virtual void foo() = 0; +protected: + virtual void bar() = 0; +}; + +class Derived : public Base { +public: + void foo() override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `foo` is not implemented."); + } +protected: + void bar() override { + // TODO: Implement this pure virtual method. 
+ static_assert(false, "Method `bar` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, AccessSpecifierAlreadyExisting) { + const char *Before = R"cpp( +class Base { +public: + virtual void func1() = 0; +}; + +class ^Derived : public Base { +public: +}; +)cpp"; + + const auto *Expected = R"cpp( +class Base { +public: + virtual void func1() = 0; +}; + +class Derived : public Base { +public: + void func1() override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `func1` is not implemented."); + } + +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, ConstexprSpecifier) { + ExtraArgs.push_back("-std=c++20"); + + constexpr auto Before = R"cpp( +class B { +public: + constexpr virtual int getValue() const = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class B { +public: + constexpr virtual int getValue() const = 0; +}; + +class D : public B { +public: + constexpr int getValue() const override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `getValue` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, ConstevalSpecifier) { + ExtraArgs.push_back("-std=c++20"); + + constexpr auto Before = R"cpp( +class B { +public: + virtual consteval float calculate() = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class B { +public: + virtual consteval float calculate() = 0; +}; + +class D : public B { +public: + consteval float calculate() override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `calculate` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, LValueRefQualifier) { + constexpr auto Before = R"cpp( +class B { +public: + virtual void process() & = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class B { +public: + virtual void process() & = 0; +}; + +class D : public B { +public: + void process() & override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `process` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, RValueRefQualifier) { + constexpr auto Before = R"cpp( +class B { +public: + virtual bool isValid() && = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class B { +public: + virtual bool isValid() && = 0; +}; + +class D : public B { +public: + bool isValid() && override { + // TODO: Implement this pure virtual method. 
+ static_assert(false, "Method `isValid` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, SimpleTrailingReturnType) { + constexpr auto Before = R"cpp( +class B { +public: + virtual auto getStatus() -> bool = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class B { +public: + virtual auto getStatus() -> bool = 0; +}; + +class D : public B { +public: + auto getStatus() -> bool override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `getStatus` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, ConstexprLValueRefAndTrailingReturn) { + ExtraArgs.push_back("-std=c++20"); + + constexpr auto Before = R"cpp( +class B { +public: + constexpr virtual auto getData() & -> const char * = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class B { +public: + constexpr virtual auto getData() & -> const char * = 0; +}; + +class D : public B { +public: + constexpr auto getData() & -> const char * override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `getData` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, ConstevalRValueRefAndTrailingReturn) { + ExtraArgs.push_back("-std=c++20"); + + constexpr auto Before = R"cpp( +class B { +public: + virtual consteval auto foo() && -> double = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class B { +public: + virtual consteval auto foo() && -> double = 0; +}; + +class D : public B { +public: + consteval auto foo() && -> double override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `foo` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, CombinedFeaturesWithTrailingReturnTypes) { + ExtraArgs.push_back("-std=c++20"); + + constexpr auto Before = R"cpp( +class B { +public: + virtual auto f1() & -> int = 0; + constexpr virtual auto f2() && -> int = 0; + virtual consteval auto f3() -> int = 0; + virtual auto f4() const & -> char = 0; + constexpr virtual auto f5() const && -> bool = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class B { +public: + virtual auto f1() & -> int = 0; + constexpr virtual auto f2() && -> int = 0; + virtual consteval auto f3() -> int = 0; + virtual auto f4() const & -> char = 0; + constexpr virtual auto f5() const && -> bool = 0; +}; + +class D : public B { +public: + auto f1() & -> int override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `f1` is not implemented."); + } + + constexpr auto f2() && -> int override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `f2` is not implemented."); + } + + consteval auto f3() -> int override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `f3` is not implemented."); + } + + auto f4() const & -> char override { + // TODO: Implement this pure virtual method. 
+ static_assert(false, "Method `f4` is not implemented."); + } + + constexpr auto f5() const && -> bool override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `f5` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +TEST_F(OverridePureVirtualsTests, DefaultParameters) { + ExtraArgs.push_back("-std=c++20"); + + constexpr auto Before = R"cpp( +class B { +public: + virtual void foo(int var = 0) = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +class B { +public: + virtual void foo(int var = 0) = 0; +}; + +class D : public B { +public: + void foo(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `foo` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + +} // namespace +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 9eb3835fe8340..bccb0ca83c79c 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -58,9 +58,6 @@ Semantic Highlighting Compile flags ^^^^^^^^^^^^^ -- Added `BuiltinHeaders` config key which controls whether clangd's built-in - headers are used or ones extracted from the driver. - Hover ^^^^^ @@ -70,6 +67,14 @@ Code completion Code actions ^^^^^^^^^^^^ +- New ``Override pure virtual methods`` code action. When invoked on a class + definition, this action automatically generates C++ ``override`` declarations + for all pure virtual methods inherited from its base classes that have not yet + been implemented. The generated method stubs prompts the user for the actual + implementation. The overrides are intelligently grouped under their original + access specifiers (e.g., ``public``, ``protected``), creating new access + specifier blocks if necessary. + Signature help ^^^^^^^^^^^^^^ @@ -93,300 +98,21 @@ Improvements to clang-query arguments. So when porting a query to C++, remove all instances of trailing comma (otherwise C++ compiler will just complain about "expected expression"). -Improvements to include-cleaner -------------------------------- -- Deprecated the ``-insert`` and ``-remove`` command line options, and added - the ``-disable-remove`` and ``-disable-insert`` command line options as - replacements. The previous command line options were confusing because they - did not imply the default state of the option (which is inserts and removes - being enabled). The new options are easier to understand the semantics of. - Improvements to clang-tidy -------------------------- -- Changed the :program:`check_clang_tidy.py` tool to use FileCheck's - ``--match-full-lines`` instead of ``strict-whitespace`` for ``CHECK-FIXES`` - clauses. Added a ``--match-partial-fixes`` option to keep previous behavior on - specific tests. This may break tests for users with custom out-of-tree checks - who use :program:`check_clang_tidy.py` as-is. - -- Improved :program:`clang-tidy-diff.py` script. Add the `-warnings-as-errors` - argument to treat warnings as errors. - -- Improved :program:`clang-tidy` to show `CheckOptions` only for checks enabled - in `Checks` when running ``--dump-config``. - -- Fixed bug in :program:`clang-tidy` by which `HeaderFilterRegex` did not take - effect when passed via the `.clang-tidy` file. 
- -- Fixed bug in :program:`run_clang_tidy.py` where the program would not - correctly display the checks enabled by the top-level `.clang-tidy` file. - New checks ^^^^^^^^^^ -- New :doc:`bugprone-capturing-this-in-member-variable - ` check. - - Finds lambda captures and ``bind`` function calls that capture the ``this`` - pointer and store it as class members without handle the copy and move - constructors and the assignments. - -- New :doc:`bugprone-misleading-setter-of-reference - ` check. - - Finds setter-like member functions that take a pointer parameter and set a - reference member of the same class with the pointed value. - -- New :doc:`bugprone-unintended-char-ostream-output - ` check. - - Finds unintended character output from ``unsigned char`` and ``signed char`` - to an ``ostream``. - -- New :doc:`cppcoreguidelines-use-enum-class - ` check. - - Finds unscoped (non-class) ``enum`` declarations and suggests using - ``enum class`` instead. - -- New :doc:`llvm-prefer-static-over-anonymous-namespace - ` check. - - Finds function and variable declarations inside anonymous namespace and - suggests replacing them with ``static`` declarations. - -- New :doc:`modernize-use-scoped-lock - ` check. - - Finds uses of ``std::lock_guard`` and suggests replacing them with C++17's - alternative ``std::scoped_lock``. - -- New :doc:`portability-avoid-pragma-once - ` check. - - Finds uses of ``#pragma once`` and suggests replacing them with standard - include guards (``#ifndef``/``#define``/``#endif``) for improved portability. - -- New :doc:`readability-ambiguous-smartptr-reset-call - ` check. - - Finds potentially erroneous calls to ``reset`` method on smart pointers when - the pointee type also has a ``reset`` method. - -- New :doc:`readability-use-concise-preprocessor-directives - ` check. - - Finds uses of ``#if`` that can be simplified to ``#ifdef`` or ``#ifndef`` and, - since C23 and C++23, uses of ``#elif`` that can be simplified to ``#elifdef`` - or ``#elifndef``. - New check aliases ^^^^^^^^^^^^^^^^^ Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ -- Improved :doc:`bugprone-crtp-constructor-accessibility - ` check by fixing - false positives on deleted constructors that cannot be used to construct - objects, even if they have public or protected access. - -- Improved :doc:`bugprone-exception-escape - ` check to print stack trace - of a potentially escaped exception. - -- Added an option to :doc:`bugprone-multi-level-implicit-pointer-conversion - ` to - choose whether to enable the check in C code or not. - -- Improved :doc:`bugprone-optional-value-conversion - ` check to detect - conversion in argument of ``std::make_optional``. - -- Improved :doc:`bugprone-sizeof-expression - ` check by adding - `WarnOnSizeOfInLoopTermination` option to detect misuses of ``sizeof`` - expression in loop conditions. - -- Improved :doc:`bugprone-string-constructor - ` check to find suspicious - calls of ``std::string`` constructor with char pointer, start position and - length parameters. - -- Improved :doc:`bugprone-unchecked-optional-access - ` fixing false - positives from smart pointer accessors repeated in checking ``has_value`` - and accessing ``value``. The option `IgnoreSmartPointerDereference` should - no longer be needed and will be removed. Also fixing false positive from - const reference accessors to objects containing optional member. - -- Improved :doc:`bugprone-unsafe-functions - ` check to allow specifying - additional C++ member functions to match. 
- -- Improved :doc:`cert-err33-c - ` check by fixing false positives when - a function name is just prefixed with a targeted function name. - -- Improved :doc:`concurrency-mt-unsafe - ` check by fixing a false positive - where ``strerror`` was flagged as MT-unsafe. - -- Improved :doc:`cppcoreguidelines-avoid-goto - ` check by adding the option - `IgnoreMacros` to ignore ``goto`` labels defined in macros. - -- Improved :doc:`cppcoreguidelines-interfaces-global-init - ` check by - fixing false positives on uses of ``constinit`` variables. - -- Improved :doc:`cppcoreguidelines-missing-std-forward - ` check by adding a - flag to specify the function used for forwarding instead of ``std::forward``. - -- Improved :doc:`cppcoreguidelines-pro-bounds-pointer-arithmetic - ` check by - fixing false positives when calling indexing operators that do not perform - pointer arithmetic in template, for example ``std::map::operator[]`` and - when pointer arithmetic was used through type aliases. - -- Improved :doc:`cppcoreguidelines-rvalue-reference-param-not-moved - ` check - by adding a flag to specify the function used for moving instead of - ``std::move``. - -- Improved :doc:`cppcoreguidelines-special-member-functions - ` check by - adding the option `IgnoreMacros` to ignore classes defined in macros. - -- Improved :doc:`google-readability-namespace-comments - ` check by adding - the option `AllowOmittingNamespaceComments` to accept if a namespace comment - is omitted entirely. - -- Improved :doc:`hicpp-avoid-goto - ` check by adding the option - `IgnoreMacros` to ignore ``goto`` labels defined in macros. - -- Improved :doc:`hicpp-special-member-functions - ` check by adding the - option `IgnoreMacros` to ignore classes defined in macros. - -- Improved :doc:`llvm-namespace-comment - ` check by adding the option - `AllowOmittingNamespaceComments` to accept if a namespace comment is omitted - entirely. - -- Improved :doc:`misc-const-correctness - ` check by adding the option - `AllowedTypes`, that excludes specified types from const-correctness - checking and fixing false positives when modifying variant by ``operator[]`` - with template in parameters and supporting to check pointee mutation by - `AnalyzePointers` option and fixing false positives when using const array - type. - -- Improved :doc:`misc-include-cleaner - ` check by adding the options - `UnusedIncludes` and `MissingIncludes`, which specify whether the check should - report unused or missing includes respectively. - -- Improved :doc:`misc-redundant-expression - ` check by providing additional - examples and fixing some macro related false positives. - -- Improved :doc:`misc-unconventional-assign-operator - ` check by fixing - false positives when copy assignment operator function in a template class - returns the result of another assignment to ``*this`` (``return *this=...``). - -- Improved :doc:`misc-unused-using-decls - ` check by fixing false positives - on ``operator""`` with template parameters. - -- Improved :doc:`misc-use-internal-linkage - ` check by fix false positives - for function or variable in header file which contains macro expansion and - excluding variables with ``thread_local`` storage class specifier from being - matched. - -- Improved :doc:`modernize-pass-by-value - ` check by fixing false positives - when class passed by const-reference had a private move constructor. - -- Improved :doc:`modernize-type-traits - ` check by detecting more type traits. 
- -- Improved :doc:`modernize-use-default-member-init - ` check by matching - arithmetic operations, ``constexpr`` and ``static`` values, and detecting - explicit casting of built-in types within member list initialization. - -- Improved :doc:`modernize-use-designated-initializers - ` check by avoiding - diagnosing designated initializers for ``std::array`` initializations. - -- Improved :doc:`modernize-use-ranges - ` check by updating suppress - warnings logic for ``nullptr`` in ``std::find``. - -- Improved :doc:`modernize-use-starts-ends-with - ` check by adding more - matched scenarios of ``find`` and ``rfind`` methods and fixing false - positives when those methods were called with 3 arguments. - -- Improved :doc:`modernize-use-std-numbers - ` check to support math - functions of different precisions. - -- Improved :doc:`modernize-use-trailing-return-type - ` check by adding - support to modernize lambda signatures to use trailing return type and adding - two new options: `TransformFunctions` and `TransformLambdas` to control - whether function declarations and lambdas should be transformed by the check. - Fixed false positives when lambda was matched as a function in C++11 mode. - -- Improved :doc:`performance-move-const-arg - ` check by fixing false - negatives on ternary operators calling ``std::move``. - -- Improved :doc:`performance-unnecessary-value-param - ` check performance by - tolerating fix-it breaking compilation when functions is used as pointers - to avoid matching usage of functions within the current compilation unit. - Added an option `IgnoreCoroutines` with the default value `true` to - suppress this check for coroutines where passing by reference may be unsafe. - -- Improved :doc:`readability-convert-member-functions-to-static - ` check by - fixing false positives on member functions with an explicit object parameter. - -- Improved :doc:`readability-function-size - ` check by adding new option - `CountMemberInitAsStmt` that allows counting class member initializers in - constructors as statements. - -- Improved :doc:`readability-math-missing-parentheses - ` check by fixing - false negatives where math expressions are the operand of assignment operators - or comparison operators. - -- Improved :doc:`readability-named-parameter - ` check by adding the option - `InsertPlainNamesInForwardDecls` to insert parameter names without comments - for forward declarations only. - -- Improved :doc:`readability-qualified-auto - ` check by adding the option - `AllowedTypes`, that excludes specified types from adding qualifiers. - -- Improved :doc:`readability-redundant-inline-specifier - ` check by fixing - false positives on out-of-line explicitly defaulted functions. - -- Improved :doc:`readability-redundant-smartptr-get - ` check by fixing - some false positives involving smart pointers to arrays. +- Improved :doc:`bugprone-infinite-loop + ` check by adding detection for + variables introduced by structured bindings. Removed checks ^^^^^^^^^^^^^^ @@ -414,3 +140,4 @@ Improvements to pp-trace Clang-tidy Visual Studio plugin ------------------------------- + diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index 9611c655886f2..ad12b2343d1e9 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -19,11 +19,11 @@ check, the rest of this document explains how to do this. 
There are a few tools particularly useful when developing clang-tidy checks: * ``add_new_check.py`` is a script to automate the process of adding a new - check, it will create the check, update the CMake file and create a test; + check; it will create the check, update the CMake file and create a test. * ``rename_check.py`` does what the script name suggests, renames an existing - check; + check. * :program:`pp-trace` logs method calls on `PPCallbacks` for a source file - and is invaluable in understanding the preprocessor mechanism; + and is invaluable in understanding the preprocessor mechanism. * :program:`clang-query` is invaluable for interactive prototyping of AST matchers and exploration of the Clang AST; * `clang-check`_ with the ``-ast-dump`` (and optionally ``-ast-dump-filter``) @@ -47,7 +47,7 @@ implemented as a: + *Clang diagnostic*: if the check is generic enough, targets code patterns that most probably are bugs (rather than style or readability issues), can be - implemented effectively and with extremely low false positive rate, it may + implemented effectively and with extremely low false-positive rate, it may make a good Clang diagnostic. + *Clang static analyzer check*: if the check requires some sort of control flow @@ -77,7 +77,7 @@ make sure that you enable the ``clang`` and ``clang-tools-extra`` projects to build :program:`clang-tidy`. Because your new check will have associated documentation, you will also want to install `Sphinx `_ and enable it in the CMake configuration. -To save build time of the core Clang libraries you may want to only enable the ``X86`` +To save build time of the core Clang libraries, you may want to only enable the ``X86`` target in the CMake configuration. @@ -130,7 +130,7 @@ So you have an idea of a useful check for :program:`clang-tidy`. First, if you're not familiar with LLVM development, read through the `Getting Started with the LLVM System`_ document for instructions on setting up your workflow and the `LLVM Coding Standards`_ document to familiarize yourself with the coding -style used in the project. For code reviews we currently use `LLVM Github`_, +style used in the project. For code reviews, we currently use `LLVM Github`_, though historically we used Phabricator. .. _Getting Started with the LLVM System: https://llvm.org/docs/GettingStarted.html @@ -141,7 +141,7 @@ Next, you need to decide which module the check belongs to. Modules are located in subdirectories of `clang-tidy/ `_ and contain checks targeting a certain aspect of code quality (performance, -readability, etc.), certain coding style or standard (Google, LLVM, CERT, etc.) +readability, etc.), a certain coding style or standard (Google, LLVM, CERT, etc.) or a widely used API (e.g. MPI). Their names are the same as the user-facing check group names described :ref:`above `. @@ -166,7 +166,7 @@ The ``add_new_check.py`` script will: * create a documentation file and include it into the ``docs/clang-tidy/checks/list.rst``. -Let's see in more detail at the check class definition: +Let's look at the check class definition in more detail: .. code-block:: c++ @@ -200,7 +200,7 @@ In our case the check needs to operate on the AST level and it overrides the preprocessor level, we'd need instead to override the ``registerPPCallbacks`` method. 
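To make the two entry points concrete, here is a minimal sketch of such a check class; the class name, namespace, and include path are illustrative only and not part of this patch:

.. code-block:: c++

  // Hypothetical check used only to illustrate the override points described
  // above; an AST-level check implements registerMatchers() and check().
  #include "../ClangTidyCheck.h"

  namespace clang::tidy::misc {

  class MyExampleCheck : public ClangTidyCheck {
  public:
    MyExampleCheck(StringRef Name, ClangTidyContext *Context)
        : ClangTidyCheck(Name, Context) {}

    // Register AST matchers here; the bound nodes are delivered to check().
    void registerMatchers(ast_matchers::MatchFinder *Finder) override;
    void check(const ast_matchers::MatchFinder::MatchResult &Result) override;

    // A preprocessor-level check would instead override:
    // void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP,
    //                          Preprocessor *ModuleExpanderPP) override;
  };

  } // namespace clang::tidy::misc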
-In the ``registerMatchers`` method we create an AST Matcher (see `AST Matchers`_ +In the ``registerMatchers`` method, we create an AST Matcher (see `AST Matchers`_ for more information) that will find the pattern in the AST that we want to inspect. The results of the matching are passed to the ``check`` method, which can further inspect them and report diagnostics. @@ -320,7 +320,7 @@ the ``add_new_check.py`` script: Developing your check incrementally ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The best way to develop your check is to start with the simple test cases and increase +The best way to develop your check is to start with simple test cases and increase complexity incrementally. The test file created by the ``add_new_check.py`` script is a starting point for your test cases. A rough outline of the process looks like this: @@ -393,7 +393,7 @@ good way to catch things you forgot to account for in your matchers. However, t LLVM code base may be insufficient for testing purposes as it was developed against a particular set of coding styles and quality measures. The larger the corpus of code the check is tested against, the higher confidence the community will have in the -check's efficacy and false positive rate. +check's efficacy and false-positive rate. Some suggestions to ensure your check is robust: @@ -406,10 +406,10 @@ Some suggestions to ensure your check is robust: - Define template classes that contain code matched by your check. - Define template specializations that contain code matched by your check. - Test your check under both Windows and Linux environments. -- Watch out for high false positive rates. Ideally, a check would have no false +- Watch out for high false-positive rates. Ideally, a check would have no false positives, but given that matching against an AST is not control- or data flow- - sensitive, a number of false positives are expected. The higher the false - positive rate, the less likely the check will be adopted in practice. + sensitive, a number of false positives are expected. The higher the + false-positive rate, the less likely the check will be adopted in practice. Mechanisms should be put in place to help the user manage false positives. - There are two primary mechanisms for managing false positives: supporting a code pattern which allows the programmer to silence the diagnostic in an ad @@ -428,10 +428,10 @@ Documenting your check The ``add_new_check.py`` script creates entries in the `release notes `_, the list of checks and a new file for the check documentation itself. It is recommended that you -have a concise summation of what your check does in a single sentence that is repeated +have a concise summary of what your check does in a single sentence that is repeated in the release notes, as the first sentence in the doxygen comments in the header file for your check class and as the first sentence of the check documentation. Avoid the -phrase "this check" in your check summation and check documentation. +phrase "this check" in your check summary and check documentation. If your check relates to a published coding guideline (C++ Core Guidelines, MISRA, etc.) or style guide, provide links to the relevant guideline or style guide sections in your @@ -443,10 +443,10 @@ If there are exceptions or limitations to your check, document them thoroughly. will help users understand the scope of the diagnostics and fix-its provided by the check. 
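Tying the ``registerMatchers``/``check`` flow described above together, a minimal sketch of the two method bodies might look like the following; the matcher pattern, binding name, and diagnostic text are illustrative only:

.. code-block:: c++

  // Hedged sketch reusing the hypothetical MyExampleCheck from the previous
  // sketch: match a pattern in registerMatchers(), report it in check().
  using namespace clang::ast_matchers;

  void MyExampleCheck::registerMatchers(MatchFinder *Finder) {
    // Bind every function whose fully qualified name starts with "awesome_"
    // (an arbitrary pattern chosen for illustration).
    Finder->addMatcher(functionDecl(matchesName("^::awesome_")).bind("func"),
                       this);
  }

  void MyExampleCheck::check(const MatchFinder::MatchResult &Result) {
    const auto *MatchedDecl = Result.Nodes.getNodeAs<FunctionDecl>("func");
    // Report a diagnostic at the function name; %0 is filled in from the
    // streamed declaration.
    diag(MatchedDecl->getLocation(), "function %0 has a suspicious name")
        << MatchedDecl;
  }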
Building the target ``docs-clang-tools-html`` will run the Sphinx documentation generator -and create documentation HTML files in the tools/clang/tools/extra/docs/html directory in +and create HTML documentation files in the tools/clang/tools/extra/docs/html directory in your build tree. Make sure that your check is correctly shown in the release notes and the list of checks. Make sure that the formatting and structure of your check's documentation -looks correct. +look correct. Registering your Check @@ -503,11 +503,11 @@ Configuring Checks If a check needs configuration options, it can access check-specific options using the ``Options.get("SomeOption", DefaultValue)`` call in the check -constructor. In this case the check should also override the +constructor. In this case, the check should also override the ``ClangTidyCheck::storeOptions`` method to make the options provided by the check discoverable. This method lets :program:`clang-tidy` know which options the check implements and what the current values are (e.g. for the -``-dump-config`` command line option). +``-dump-config`` command-line option). .. code-block:: c++ @@ -576,7 +576,7 @@ typically the basic ``CHECK`` forms (``CHECK-MESSAGES`` and ``CHECK-FIXES``) are sufficient for clang-tidy tests. Note that the `FileCheck`_ documentation mostly assumes the default prefix (``CHECK``), and hence describes the directive as ``CHECK:``, ``CHECK-SAME:``, ``CHECK-NOT:``, etc. -Replace ``CHECK`` by either ``CHECK-FIXES`` or ``CHECK-MESSAGES`` for +Replace ``CHECK`` with either ``CHECK-FIXES`` or ``CHECK-MESSAGES`` for clang-tidy tests. An additional check enabled by ``check_clang_tidy.py`` ensures that @@ -590,7 +590,7 @@ appropriate ``RUN`` line in the ``test/clang-tidy`` directory. Use diagnostic messages and fixed code. It's advised to make the checks as specific as possible to avoid checks matching -to incorrect parts of the input. Use ``[[@LINE+X]]``/``[[@LINE-X]]`` +incorrect parts of the input. Use ``[[@LINE+X]]``/``[[@LINE-X]]`` substitutions and distinct function and variable names in the test code. Here's an example of a test using the ``check_clang_tidy.py`` script (the full @@ -606,7 +606,7 @@ source code is at `test/clang-tidy/checkers/google/readability-casting.cpp`_): // CHECK-FIXES: int b = a; } -To check more than one scenario in the same test file use +To check more than one scenario in the same test file, use ``-check-suffix=SUFFIX-NAME`` on ``check_clang_tidy.py`` command line or ``-check-suffixes=SUFFIX-NAME-1,SUFFIX-NAME-2,...``. With ``-check-suffix[es]=SUFFIX-NAME`` you need to replace your ``CHECK-*`` @@ -631,15 +631,15 @@ There are many dark corners in the C++ language, and it may be difficult to make your check work perfectly in all cases, especially if it issues fix-it hints. The most frequent pitfalls are macros and templates: -1. code written in a macro body/template definition may have a different meaning - depending on the macro expansion/template instantiation; -2. multiple macro expansions/template instantiations may result in the same code +1. Code written in a macro body/template definition may have a different meaning + depending on the macro expansion/template instantiation. +2. 
Multiple macro expansions/template instantiations may result in the same code being inspected by the check multiple times (possibly, with different meanings, see 1), and the same warning (or a slightly different one) may be issued by the check multiple times; :program:`clang-tidy` will deduplicate _identical_ warnings, but if the warnings are slightly different, all of them - will be shown to the user (and used for applying fixes, if any); -3. making replacements to a macro body/template definition may be fine for some + will be shown to the user (and used for applying fixes, if any). +3. Making replacements to a macro body/template definition may be fine for some macro expansions/template instantiations, but easily break some other expansions/instantiations. @@ -657,6 +657,29 @@ directory. The path to this directory is available in a lit test with the varia .. _FileCheck: https://llvm.org/docs/CommandGuide/FileCheck.html .. _test/clang-tidy/checkers/google/readability-casting.cpp: https://github.com/llvm/llvm-project/blob/main/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp + +Submitting a Pull Request +------------------------- + +Before submitting a pull request, contributors are encouraged to run +:program:`clang-tidy` and :program:`clang-format` on their changes to ensure +code quality and catch potential issues. While :program:`clang-tidy` is not +currently enforced in CI, following this practice helps maintain code +consistency and prevent common errors. + +Here's a useful command to check your staged changes: + +.. code-block:: console + + $ git diff --staged -U0 | ./clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py \ + -j $(nproc) -path build/ -p1 -only-check-in-db + $ git clang-format + +Note that some warnings may be false positives or require careful consideration +before fixing. Use your judgment and feel free to discuss in the pull request +if you're unsure about a particular warning. + + Out-of-tree check plugins ------------------------- @@ -675,7 +698,7 @@ names of the checks to enable. $ clang-tidy --checks=-*,my-explicit-constructor -list-checks -load myplugin.so -There is no expectations regarding ABI and API stability, so the plugin must be +There are no expectations regarding ABI and API stability, so the plugin must be compiled against the version of clang-tidy that will be loading the plugin. The plugins can use threads, TLS, or any other facilities available to in-tree @@ -697,10 +720,10 @@ and write a version of `check_clang_tidy.py`_ to suit your needs. Running clang-tidy on LLVM -------------------------- -To test a check it's best to try it out on a larger code base. LLVM and Clang +To test a check, it's best to try it out on a larger code base. LLVM and Clang are the natural targets as you already have the source code around. The most convenient way to run :program:`clang-tidy` is with a compile command database; -CMake can automatically generate one, for a description of how to enable it see +CMake can automatically generate one; for a description of how to enable it, see `How To Setup Clang Tooling For LLVM`_. Once ``compile_commands.json`` is in place and a working version of :program:`clang-tidy` is in ``PATH`` the entire code base can be analyzed with ``clang-tidy/tool/run-clang-tidy.py``. The script @@ -712,18 +735,18 @@ warnings and errors. The script provides multiple configuration flags. 
* The default set of checks can be overridden using the ``-checks`` argument, - taking the identical format as :program:`clang-tidy` does. For example + taking the identical format as :program:`clang-tidy` does. For example, ``-checks=-*,modernize-use-override`` will run the ``modernize-use-override`` check only. -* To restrict the files examined you can provide one or more regex arguments +* To restrict the files examined, you can provide one or more regex arguments that the file names are matched against. ``run-clang-tidy.py clang-tidy/.*Check\.cpp`` will only analyze `clang-tidy` checks. It may also be necessary to restrict the header files that warnings are displayed from by using the ``-header-filter`` and ``-exclude-header-filter`` flags. They have the same behavior as the corresponding :program:`clang-tidy` flags. -* To apply suggested fixes ``-fix`` can be passed as an argument. This gathers +* To apply suggested fixes, ``-fix`` can be passed as an argument. This gathers all changes in a temporary directory and applies them. Passing ``-format`` will run clang-format over changed lines. @@ -772,7 +795,7 @@ There is only one argument that controls profile storage: * ``-store-check-profile=`` - By default reports are printed in tabulated format to stderr. When this option + By default, reports are printed in tabulated format to stderr. When this option is passed, these per-TU profiles are instead stored as JSON. If the prefix is not an absolute path, it is considered to be relative to the directory from where you have run :program:`clang-tidy`. All ``.`` and ``..`` diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index baff90faa6eae..49cc13606f4c2 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -140,7 +140,6 @@ class ASTWalker : public RecursiveASTVisitor { return true; switch (Qual->getKind()) { case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Global: return true; case NestedNameSpecifier::TypeSpec: diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/infinite-loop.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/infinite-loop.cpp index bc14ece3f332c..9a58a7ae2f2ab 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/infinite-loop.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/infinite-loop.cpp @@ -711,3 +711,205 @@ void test_local_static_recursion() { while (i >= 0) p(0); // we don't know what p points to so no warning } + +struct PairVal { + int a; + int b; + PairVal(int a, int b) : a(a), b(b) {} +}; + +void structured_binding_infinite_loop1() { + auto [x, y] = PairVal(0, 0); + while (x < 10) { + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition variables (x) are updated in the loop body [bugprone-infinite-loop] + y++; + } + while (y < 10) { + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition variables (y) are updated in the loop body [bugprone-infinite-loop] + x++; + } +} + +void structured_binding_infinite_loop2() { + auto [x, y] = PairVal(0, 0); + while (x < 10) { + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition variables (x) are updated in the loop body [bugprone-infinite-loop] + // No update to x or y + } + while (y < 10) { + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition 
variables (y) are updated in the loop body [bugprone-infinite-loop] + // No update to x or y + } +} + +void structured_binding_not_infinite1() { + auto [x, y] = PairVal(0, 0); + while (x < 10) { + x++; + } + while (y < 10) { + y++; + } +} + +void volatile_structured_binding_in_condition() { + volatile auto [x, y] = PairVal(0, 0); + while (!x) {} +} + +void test_local_static_structured_binding_recursion() { + static auto [i, _] = PairVal(0, 0); + int j = 0; + + i--; + while (i >= 0) + test_local_static_structured_binding_recursion(); // no warning, recursively decrement i + for (; i >= 0;) + test_local_static_structured_binding_recursion(); // no warning, recursively decrement i + for (; i + j >= 0;) + test_local_static_structured_binding_recursion(); // no warning, recursively decrement i + for (; i >= 0; i--) + ; // no warning, i decrements + while (j >= 0) + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition variables (j) are updated in the loop body [bugprone-infinite-loop] + test_local_static_structured_binding_recursion(); + + int (*p)(int) = 0; + + while (i >= 0) + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition variables (i) are updated in the loop body [bugprone-infinite-loop] + p = 0; + while (i >= 0) + p(0); // we don't know what p points to so no warning +} + +struct S { int a; }; +void issue_138842_reduced() { + int x = 10; + auto [y] = S{1}; + + while (y < x) { + y++; + } +} + +namespace std { +template +struct pair { + T first; + U second; + + pair(T a, U b) : first(a), second(b) {} +}; +} + +template +void structured_binding_in_template_byval(T a, U b) { + auto [c, d] = std::pair(a,b); + + while (c < 10) { + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition variables (c) are updated in the loop body [bugprone-infinite-loop] + d++; + } + + while (c < 10) { + c++; // no warning + } +} + +template +void structured_binding_in_template_bylref(T a, U b) { + auto p = std::pair(a,b); + auto& [c, d] = p; + + while (c < 10) { + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition variables (c) are updated in the loop body [bugprone-infinite-loop] + d++; + } + + while (c < 10) { + c++; // no warning + } +} + +template +void structured_binding_in_template_byrref(T a, U b) { + auto p = std::pair(a,b); + auto&& [c, d] = p; + + while (c < 10) { + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition variables (c) are updated in the loop body [bugprone-infinite-loop] + d++; + } + + while (c < 10) { + c++; // no warning + } +} + +void structured_binding_in_template_instantiation(int b) { + structured_binding_in_template_byval(b, 0); + structured_binding_in_template_bylref(b, 0); + structured_binding_in_template_byrref(b, 0); +} + +void array_structured_binding() { + int arr[2] = {0, 0}; + auto [x, y] = arr; + + while (x < 10) { + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: this loop is infinite; none of its condition variables (x) are updated in the loop body [bugprone-infinite-loop] + y++; + } + + while (y < 10) { + y++; // no warning + } +} + +namespace std { + using size_t = int; + template struct tuple_size; + template struct tuple_element; + template class tuple; + +namespace { + template + struct size_helper { static const T value = v; }; +} // namespace + +template +struct tuple_size> : size_helper {}; + +template +struct tuple_element> { + using type = __type_pack_element; +}; + +template class 
tuple {}; + +template +typename tuple_element>::type get(tuple); +} // namespace std + +std::tuple &get_chunk(); + +void test_structured_bindings_tuple() { + auto [buffer, size ] = get_chunk(); + int maxLen = 8; + + while (size < maxLen) { + // No warning. The loop is finite because 'size' is being incremented in each iteration and compared against 'maxLen' for termination + buffer[size++] = 2; + } +} + +void test_structured_bindings_tuple_ref() { + auto& [buffer, size ] = get_chunk(); + int maxLen = 8; + + while (size < maxLen) { + // No warning. The loop is finite because 'size' is being incremented in each iteration and compared against 'maxLen' for termination + buffer[size++] = 2; + } +} diff --git a/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake b/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake index ecd478aefdaee..b328f3d0ff9cf 100644 --- a/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake +++ b/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake @@ -43,3 +43,6 @@ set(CLANG_BOOTSTRAP_CMAKE_ARGS ${EXTRA_ARGS} -C ${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2.cmake CACHE STRING "") + +# Do not use LLVM build for generating PGO data. +set(CLANG_PGO_TRAINING_USE_LLVM_BUILD OFF CACHE BOOL "") diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 0e21ef0244f78..d39ee49b432e5 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -4201,8 +4201,8 @@ the configuration (without a prefix: ``Auto``). * ``""`` means "arbitrary suffix" * ``"$"`` means "no suffix" - For example, if configured to ``"(_test)?$"``, then a header a.h would be seen - as the "main" include in both a.cc and a_test.cc. + For example, if configured to ``"(_test)?$"``, then a header a.h would be + seen as the "main" include in both a.cc and a_test.cc. .. _IncludeIsMainSourceRegex: @@ -6015,6 +6015,16 @@ the configuration (without a prefix: ``Auto``). #include "B/A.h" #include "B/a.h" #include "B/a.h" #include "a/b.h" + * ``bool IgnoreExtension`` When sorting includes in each block, only take file extensions into + account if two includes compare equal otherwise. + + .. code-block:: c++ + + true: false: + # include "A.h" vs. # include "A-util.h" + # include "A.inc" # include "A.h" + # include "A-util.h" # include "A.inc" + .. _SortJavaStaticImport: diff --git a/clang/docs/ClangTools.rst b/clang/docs/ClangTools.rst index 60e21590f9eb3..3216328bbb6a6 100644 --- a/clang/docs/ClangTools.rst +++ b/clang/docs/ClangTools.rst @@ -89,13 +89,50 @@ they'll be tracked here. The focus of this documentation is on the scope and features of the tools for other tool developers; each tool should provide its own user-focused documentation. -``clang-tidy`` +``Clang-Doc`` +------------- + +`Clang-Doc `_ is a tool for +generating C and C++ documentation from source code and comments. + +``Clang-Include-Fixer`` +----------------------- + +`Clang-Include-Fixer `_ +is a tool to automate the addition of missing ``#include`` directives in a C++ +file. It adds missing namespace qualifiers to unidentified symbols when +necessary and also removes unused headers. + +``Clang-Tidy`` -------------- -`clang-tidy `_ is a clang-based C++ +`Clang-Tidy `_ is a Clang-based C++ linter tool. It provides an extensible framework for building compiler-based static analyses detecting and fixing bug-prone patterns, performance, -portability and maintainability issues. +portability and maintainability issues. 
It also has checks for modernizing code +to newer language standards. + +``Clangd`` +---------- + +`Clangd `_ is a language server that can work with +many editors via a plugin. It understands your C++ code and adds smart +features to your editor: code completion, compile errors, go-to-definition and +more. + +``Modularize`` +-------------- + +`Modularize `_ is a standalone +tool that checks whether a set of headers provides the consistent definitions +required to use modules. + +``pp-trace`` +------------ + +`pp-trace `_ is a standalone tool +that traces preprocessor activity. It’s also used as a test of Clang’s +``PPCallbacks`` interface. Ideas for new Tools diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index f448a9a8db172..34e1bf150aef1 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -138,7 +138,7 @@ for support for non-standardized features, i.e. features not prefixed ``c_``, ``cxx_`` or ``objc_``. Another use of ``__has_feature`` is to check for compiler features not related -to the language standard, such as e.g. :doc:`AddressSanitizer +to the language standard, such as :doc:`AddressSanitizer `. If the ``-pedantic-errors`` option is given, ``__has_extension`` is equivalent @@ -377,8 +377,8 @@ Builtin Macros ``__FILE_NAME__`` Clang-specific extension that functions similar to ``__FILE__`` but only - renders the last path component (the filename) instead of an invocation - dependent full path to that file. + renders the last path component (the filename) instead of an + invocation-dependent full path to that file. ``__COUNTER__`` Defined to an integer value that starts at zero and is incremented each time @@ -716,7 +716,7 @@ See also :ref:`langext-__builtin_shufflevector`, :ref:`langext-__builtin_convert a NEON vector or an SVE vector, it's only available in C++ and uses normal bool conversions (that is, != 0). If it's an extension (OpenCL) vector, it's only available in C and OpenCL C. - And it selects base on signedness of the condition operands (OpenCL v1.1 s6.3.9). + And it selects based on signedness of the condition operands (OpenCL v1.1 s6.3.9). .. [#] sizeof can only be used on vector length specific SVE types. .. [#] Clang does not allow the address of an element to be taken while GCC allows this. This is intentional for vectors with a boolean element type and @@ -848,6 +848,14 @@ of different sizes and signs is forbidden in binary and ternary builtins. semantics, see `LangRef `_ for the comparison. + T __builtin_elementwise_maximumnum(T x, T y) return x or y, whichever is larger. Follows IEEE 754-2019 floating point types + semantics, see `LangRef + `_ + for the comparison. + T __builtin_elementwise_minimumnum(T x, T y) return x or y, whichever is smaller. Follows IEEE 754-2019 floating point types + semantics, see `LangRef + `_ + for the comparison. ============================================== ====================================================================== ========================================= @@ -857,7 +865,7 @@ Each builtin returns a scalar equivalent to applying the specified operation(x, y) as recursive even-odd pairwise reduction to all vector elements. ``operation(x, y)`` is repeatedly applied to each non-overlapping even-odd element pair with indices ``i * 2`` and ``i * 2 + 1`` with -``i in [0, Number of elements / 2)``. If the numbers of elements is not a +``i in [0, Number of elements / 2)``. 
If the number of elements is not a power of 2, the vector is widened with neutral elements for the reduction at the end to the next power of 2. @@ -1491,7 +1499,7 @@ C++14 digit separators Use ``__cpp_digit_separators`` to determine if support for digit separators using single quotes (for instance, ``10'000``) is enabled. At this time, there -is no corresponding ``__has_feature`` name +is no corresponding ``__has_feature`` name. C++14 generalized lambda capture ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1545,7 +1553,7 @@ C++ type aware allocators ^^^^^^^^^^^^^^^^^^^^^^^^^ Use ``__has_extension(cxx_type_aware_allocators)`` to determine the existence of -support for the future C++2d type aware allocator feature. For full details see +support for the future C++2d type aware allocator feature. For full details, see :doc:`C++ Type Aware Allocators ` for additional details. C11 @@ -1643,7 +1651,7 @@ Modules Use ``__has_feature(modules)`` to determine if Modules have been enabled. For example, compiling code with ``-fmodules`` enables the use of Modules. -More information could be found `here `_. +More information can be found `here `_. Language Extensions Back-ported to Previous Standards ===================================================== @@ -1878,7 +1886,7 @@ The following type trait primitives are supported by Clang. Those traits marked C++26 relocatable types, and types which were made trivially relocatable via the ``clang::trivial_abi`` attribute. This trait is deprecated and should be replaced by - ``__builtin_is_cpp_trivially_relocatable``. Note however that it is generally + ``__builtin_is_cpp_trivially_relocatable``. Note, however, that it is generally unsafe to relocate a C++-relocatable type with ``memcpy`` or ``memmove``; use ``__builtin_trivially_relocate``. * ``__builtin_is_cpp_trivially_relocatable`` (C++): Returns true if an object diff --git a/clang/docs/LibClang.rst b/clang/docs/LibClang.rst index 6c2b11ac7fc23..e747022b9c173 100644 --- a/clang/docs/LibClang.rst +++ b/clang/docs/LibClang.rst @@ -404,3 +404,9 @@ following situations are explicitly unsupported: compatible across library versions. * For the same reason as above, serializing objects from one version of the library and deserializing with a different version is also not supported. + +Note: because libclang is a wrapper around the compiler frontend, it is not a +`security-sensitive component`_ of the LLVM Project. Consider using a sandbox +or some other mitigation approach if processing untrusted input. + +.. _security-sensitive component: https://llvm.org/docs/Security.html#what-is-considered-a-security-issue diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1eb3e369a302e..4f6e9a4b4bd1e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -46,6 +46,7 @@ Potentially Breaking Changes ``endbr64`` instruction at the labels named as possible branch destinations, so it is not safe to use a register-controlled branch instruction to branch to one. (In line with gcc.) +- Added a sugar type `PredefinedSugarType` to improve diagnostic messages. (#GH143653) C/C++ Language Potentially Breaking Changes ------------------------------------------- @@ -76,6 +77,10 @@ C++ Specific Potentially Breaking Changes whose nested-name-specifier doesn't refer to a base class such as ``using CurrentClass::Foo;`` is now rejected in C++98 mode. +- For C++20 modules, the Reduced BMI mode will be the default option. 
This may introduce + regressions if your build system supports two-phase compilation model but haven't support + reduced BMI or it is a compiler bug or a bug in users code. + ABI Changes in This Version --------------------------- @@ -339,6 +344,7 @@ Non-comprehensive list of changes in this release - Added `__builtin_elementwise_exp10`. - For AMDPGU targets, added `__builtin_v_cvt_off_f32_i4` that maps to the `v_cvt_off_f32_i4` instruction. - Added `__builtin_elementwise_minnum` and `__builtin_elementwise_maxnum`. +- Added `__builtin_elementwise_minnumnum` and `__builtin_elementwise_maxnumnum`. - No longer crashing on invalid Objective-C categories and extensions when dumping the AST as JSON. (#GH137320) - Clang itself now uses split stacks instead of threads for allocating more @@ -674,7 +680,7 @@ Improvements to Clang's diagnostics #GH142457, #GH139913, #GH138850, #GH137867, #GH137860, #GH107840, #GH93308, #GH69470, #GH59391, #GH58172, #GH46215, #GH45915, #GH45891, #GH44490, #GH36703, #GH32903, #GH23312, #GH69874. - + - Clang no longer emits a spurious -Wdangling-gsl warning in C++23 when iterating over an element of a temporary container in a range-based for loop.(#GH109793, #GH145164) @@ -710,6 +716,12 @@ Improvements to Clang's diagnostics pointer, provided it can be proven that the pointer only points to ``[[noreturn]]`` functions. +- Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic + diagnostics for function effects (``[[clang::nonblocking]]`` and ``[[clang::nonallocating]]``). + Moved the warning for a missing (though implied) attribute on a redeclaration into this group. + Added a new warning in this group for the case where the attribute is missing/implicit on + an override of a virtual method. + Improvements to Clang's time-trace ---------------------------------- @@ -802,6 +814,11 @@ Bug Fixes in This Version nested scopes. (#GH147495) - Fixed a failed assertion with an operator call expression which comes from a macro expansion when performing analysis for nullability attributes. (#GH138371) +- Fixed a concept equivalent checking crash due to untransformed constraint expressions. (#GH146614) +- Fixed a crash in `clang-scan-deps` when a module with the same name is found + in different locations (#GH134404, #GH146976). +- Fix a crash when marco name is empty in ``#pragma push_macro("")`` or + ``#pragma pop_macro("")``. (GH149762). Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -970,6 +987,7 @@ Bug Fixes to C++ Support - Fixed a crash involving list-initialization of an empty class with a non-empty initializer list. (#GH147949) - Fixed constant evaluation of equality comparisons of constexpr-unknown references. (#GH147663) +- Diagnose binding a reference to ``*nullptr`` during constant evaluation. (#GH48665) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1199,6 +1217,8 @@ Static Analyzer --------------- - Fixed a crash when C++20 parenthesized initializer lists are used. This issue was causing a crash in clang-tidy. (#GH136041) +- The Clang Static Analyzer now handles parenthesized initialization. 
+ (#GH148875) New features ^^^^^^^^^^^^ diff --git a/clang/docs/SanitizerSpecialCaseList.rst b/clang/docs/SanitizerSpecialCaseList.rst index 2c50778d0f491..194f2fc5a7825 100644 --- a/clang/docs/SanitizerSpecialCaseList.rst +++ b/clang/docs/SanitizerSpecialCaseList.rst @@ -39,6 +39,7 @@ Example void bad_foo() { int *a = (int*)malloc(40); a[10] = 1; + free(a); } int main() { bad_foo(); } $ cat ignorelist.txt diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index 933a57ff34dd9..31d0a5e769378 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -687,16 +687,12 @@ fails to instantiate. For such issues, users can add references to ``N::g`` in the `module purview `_ of ``M.cppm`` to ensure it is reachable, e.g. ``using N::g;``. -Support for Reduced BMIs is still experimental, but it may become the default -in the future. The expected roadmap for Reduced BMIs as of Clang 19.x is: - -1. ``-fexperimental-modules-reduced-bmi`` was introduced in v19.x -2. For v20.x, ``-fmodules-reduced-bmi`` is introduced as an equivalent non-experimental - option. It is expected to stay opt-in for 1~2 releases, though the period depends - on user feedback and may be extended. -3. Finally, ``-fmodules-reduced-bmi`` will be the default. When that time - comes, the term BMI will refer to the Reduced BMI and the Full BMI will only - be meaningful to build systems which elect to support two-phase compilation. +As of Clang 22.x, Reduced BMI is enabled by default. You may still want to +use Full BMI with ``-fno-modules-reduced-bmi`` in the following cases: +1. Your build system uses two-phase compilation but has not yet been adjusted to +handle Reduced BMI. +2. You encounter a regression with Reduced BMI that you cannot work around. Please +report an issue for this case. Experimental Non-Cascading Changes ---------------------------------- diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 284a404026dfe..af0a8746d45e7 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -319,7 +319,7 @@ output format of the diagnostics that it generates. This option, which defaults to "none", controls whether or not Clang prints the category associated with a diagnostic when emitting it. - Each diagnostic may or many not have an associated category, if it + Each diagnostic may or may not have an associated category, if it has one, it is listed in the diagnostic categorization field of the diagnostic line (in the []'s). @@ -737,7 +737,7 @@ control the crash diagnostics. crash diagnostics files, but with lower precedence than the option. Clang is also capable of generating preprocessed source file(s) and associated -run script(s) even without a crash. This is specially useful when trying to +run script(s) even without a crash. This is especially useful when trying to generate a reproducer for warnings or errors while using modules. .. option:: -gen-reproducer @@ -1061,7 +1061,7 @@ In this way, the user may only need to specify a root configuration file with Usually, config file options are placed before command-line options, regardless of the actual operation to be performed. The exception is being made for the -options prefixed with the ``$`` character. These will be used only when linker +options prefixed with the ``$`` character. These will be used only when the linker is being invoked, and added after all of the command-line specified linker inputs.
Here is some example of ``$``-prefixed options: @@ -1222,7 +1222,7 @@ existed. The push and pop pragmas will save and restore the full diagnostic state of the compiler, regardless of how it was set. It should be noted that while Clang supports the GCC pragma, Clang and GCC do not support the exact same set -of warnings, so even when using GCC compatible #pragmas there is no +of warnings, so even when using GCC-compatible #pragmas there is no guarantee that they will have identical behaviour on both compilers. Clang also doesn't yet support GCC behavior for ``#pragma diagnostic pop`` @@ -1681,7 +1681,7 @@ for more details. * ``preserve-sign`` - the sign of a flushed-to-zero number is preserved in the sign of 0 * ``positive-zero`` - denormals are flushed to positive zero - The default value depends on the target. For most targets, defaults to + The default value depends on the target. For most targets, it defaults to ``ieee``. .. option:: -f[no-]strict-float-cast-overflow @@ -1730,7 +1730,7 @@ for more details. the C and C++ standards but can be enabled using ``-ffp-contract=fast``. Fusion can be controlled with the ``FP_CONTRACT`` and ``clang fp contract`` - pragmas. Please note that pragmas will be ingored with + pragmas. Please note that pragmas will be ignored with ``-ffp-contract=fast``, and refer to the pragma documentation for a description of how the pragmas interact with the different ``-ffp-contract`` option values. @@ -1984,11 +1984,11 @@ for more details. call to runtime library functions (generally the case, but the BE might sometimes replace the library call if it knows enough about the potential range of the inputs). Overflow and non-finite values are handled by the - library implementation. For the case of multiplication overflow will occur in + library implementation. For the case of multiplication, overflow will occur in accordance with normal floating-point rules. This is the default value. * ``promoted`` Implementation of complex division using algebraic formulas at higher precision. Overflow is handled. Non-finite values are handled in some - cases. If the target does not have native support for a higher precision + cases. If the target does not have native support for a higher-precision data type, the implementation for the complex operation using the Smith algorithm will be used. Overflow may still occur in some cases. NaN and infinite values are not handled. 
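As a concrete illustration of the contraction behaviour described above, here is a minimal C++ sketch (the function is hypothetical, not part of this patch): the pragma requests fusion locally, but, as the corrected wording notes, it is ignored when the file is compiled with ``-ffp-contract=fast``.

```cpp
// Minimal sketch (illustrative only): request contraction for one function.
double fused_mul_add(double a, double b, double c) {
#pragma clang fp contract(fast)
  // a * b + c may be fused into a single fma here; the pragma has no effect
  // if the whole file is already compiled with -ffp-contract=fast.
  return a * b + c;
}
```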
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index c35311c886413..be038d9165fc6 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -6953,6 +6953,21 @@ clang_getCursorUnaryOperatorKind(CXCursor cursor); * @} */ +/* CINDEX_DEPRECATED - disabled to silence MSVC deprecation warnings */ +typedef void *CXRemapping; + +CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping clang_getRemappings(const char *); + +CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping +clang_getRemappingsFromFileList(const char **, unsigned); + +CINDEX_DEPRECATED CINDEX_LINKAGE unsigned clang_remap_getNumFiles(CXRemapping); + +CINDEX_DEPRECATED CINDEX_LINKAGE void +clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *); + +CINDEX_DEPRECATED CINDEX_LINKAGE void clang_remap_dispose(CXRemapping); + LLVM_CLANG_C_EXTERN_C_END #endif diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 66ec3395571ea..17cbfb2693308 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -277,6 +277,11 @@ class ASTContext : public RefCountedBase { mutable llvm::ContextualFoldingSet ArrayParameterTypes; + /// Store the unique Type corresponding to each Kind. + mutable std::array + PredefinedSugarTypes{}; + /// The set of nested name specifiers. /// /// This set is managed by the NestedNameSpecifier class. @@ -1192,6 +1197,8 @@ class ASTContext : public RefCountedBase { bool isInSameModule(const Module *M1, const Module *M2) const; TranslationUnitDecl *getTranslationUnitDecl() const { + assert(TUDecl->getMostRecentDecl() == TUDecl && + "The active TU is not current one!"); return TUDecl->getMostRecentDecl(); } void addTranslationUnitDecl() { @@ -1567,6 +1574,8 @@ class ASTContext : public RefCountedBase { /// and bit count. QualType getDependentBitIntType(bool Unsigned, Expr *BitsExpr) const; + QualType getPredefinedSugarType(PredefinedSugarType::Kind KD) const; + /// Gets the struct used to keep track of the extended descriptor for /// pointer to blocks. QualType getBlockDescriptorExtendedType() const; @@ -1999,11 +2008,13 @@ class ASTContext : public RefCountedBase { /// . /// /// The sizeof operator requires this (C99 6.5.3.4p4). - CanQualType getSizeType() const; + QualType getSizeType() const; + + CanQualType getCanonicalSizeType() const; /// Return the unique signed counterpart of /// the integer type corresponding to size_t. - CanQualType getSignedSizeType() const; + QualType getSignedSizeType() const; /// Return the unique type for "intmax_t" (C99 7.18.1.5), defined in /// . 
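The ``ASTContext`` changes above split the sugared and canonical views of ``size_t``. A rough sketch of how a Clang-based tool might use the new entry points, assuming the declarations land exactly as shown (the helper itself is hypothetical):

```cpp
#include "clang/AST/ASTContext.h"
#include "clang/AST/Type.h"

using namespace clang;

// Hypothetical helper: use the sugared type when pretty-printing 'size_t' in
// diagnostics, and the canonical type when comparing for semantic purposes.
static bool isSizeT(ASTContext &Ctx, QualType T) {
  QualType Sugared =
      Ctx.getPredefinedSugarType(PredefinedSugarType::Kind::SizeT);
  (void)Sugared; // would print as 'size_t' in diagnostics, not 'unsigned long'
  return Ctx.hasSameType(Ctx.getCanonicalSizeType(), T.getCanonicalType());
}
```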
diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h index 514f4cef3a694..0a2db9e205c7c 100644 --- a/clang/include/clang/AST/AbstractBasicReader.h +++ b/clang/include/clang/AST/AbstractBasicReader.h @@ -269,12 +269,7 @@ class DataStreamBasicReader : public BasicReaderBase { case NestedNameSpecifier::Namespace: cur = NestedNameSpecifier::Create(ctx, cur, - asImpl().readNamespaceDeclRef()); - continue; - - case NestedNameSpecifier::NamespaceAlias: - cur = NestedNameSpecifier::Create(ctx, cur, - asImpl().readNamespaceAliasDeclRef()); + asImpl().readNamespaceBaseDeclRef()); continue; case NestedNameSpecifier::TypeSpec: diff --git a/clang/include/clang/AST/AbstractBasicWriter.h b/clang/include/clang/AST/AbstractBasicWriter.h index fedde8a2e46c5..c105bbbe45c92 100644 --- a/clang/include/clang/AST/AbstractBasicWriter.h +++ b/clang/include/clang/AST/AbstractBasicWriter.h @@ -251,11 +251,7 @@ class DataStreamBasicWriter : public BasicWriterBase { continue; case NestedNameSpecifier::Namespace: - asImpl().writeNamespaceDeclRef(NNS->getAsNamespace()); - continue; - - case NestedNameSpecifier::NamespaceAlias: - asImpl().writeNamespaceAliasDeclRef(NNS->getAsNamespaceAlias()); + asImpl().writeNamespaceBaseDeclRef(NNS->getAsNamespace()); continue; case NestedNameSpecifier::TypeSpec: diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index c75e29c861f82..08fe1f881503b 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -565,8 +565,28 @@ class LabelDecl : public NamedDecl { static bool classofKind(Kind K) { return K == Label; } }; +/// Represents C++ namespaces and their aliases. +/// +/// FIXME: Move `NamespaceBaseDecl` and `NamespaceDecl` to "DeclCXX.h" or +/// explain why not moving. +class NamespaceBaseDecl : public NamedDecl { +protected: + using NamedDecl::NamedDecl; + +public: + NamespaceDecl *getNamespace(); + const NamespaceDecl *getNamespace() const { + return const_cast(this)->getNamespace(); + } + + static bool classof(const Decl *D) { return classofKind(D->getKind()); } + static bool classofKind(Kind K) { + return K >= firstNamespaceBase && K <= lastNamespaceBase; + } +}; + /// Represent a C++ namespace. -class NamespaceDecl : public NamedDecl, +class NamespaceDecl : public NamespaceBaseDecl, public DeclContext, public Redeclarable { /// The starting location of the source range, pointing diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 77bc3cad72ed9..33ae3d604020b 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -3186,7 +3186,7 @@ class UsingDirectiveDecl : public NamedDecl { /// \code /// namespace Foo = Bar; /// \endcode -class NamespaceAliasDecl : public NamedDecl, +class NamespaceAliasDecl : public NamespaceBaseDecl, public Redeclarable { friend class ASTDeclReader; @@ -3203,14 +3203,14 @@ class NamespaceAliasDecl : public NamedDecl, /// The Decl that this alias points to, either a NamespaceDecl or /// a NamespaceAliasDecl. 
- NamedDecl *Namespace; + NamespaceBaseDecl *Namespace; NamespaceAliasDecl(ASTContext &C, DeclContext *DC, SourceLocation NamespaceLoc, SourceLocation AliasLoc, IdentifierInfo *Alias, NestedNameSpecifierLoc QualifierLoc, - SourceLocation IdentLoc, NamedDecl *Namespace) - : NamedDecl(NamespaceAlias, DC, AliasLoc, Alias), redeclarable_base(C), - NamespaceLoc(NamespaceLoc), IdentLoc(IdentLoc), + SourceLocation IdentLoc, NamespaceBaseDecl *Namespace) + : NamespaceBaseDecl(NamespaceAlias, DC, AliasLoc, Alias), + redeclarable_base(C), NamespaceLoc(NamespaceLoc), IdentLoc(IdentLoc), QualifierLoc(QualifierLoc), Namespace(Namespace) {} void anchor() override; @@ -3222,13 +3222,11 @@ class NamespaceAliasDecl : public NamedDecl, NamespaceAliasDecl *getMostRecentDeclImpl() override; public: - static NamespaceAliasDecl *Create(ASTContext &C, DeclContext *DC, - SourceLocation NamespaceLoc, - SourceLocation AliasLoc, - IdentifierInfo *Alias, - NestedNameSpecifierLoc QualifierLoc, - SourceLocation IdentLoc, - NamedDecl *Namespace); + static NamespaceAliasDecl * + Create(ASTContext &C, DeclContext *DC, SourceLocation NamespaceLoc, + SourceLocation AliasLoc, IdentifierInfo *Alias, + NestedNameSpecifierLoc QualifierLoc, SourceLocation IdentLoc, + NamespaceBaseDecl *Namespace); static NamespaceAliasDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID); @@ -3282,7 +3280,7 @@ class NamespaceAliasDecl : public NamedDecl, /// Retrieve the namespace that this alias refers to, which /// may either be a NamespaceDecl or a NamespaceAliasDecl. - NamedDecl *getAliasedNamespace() const { return Namespace; } + NamespaceBaseDecl *getAliasedNamespace() const { return Namespace; } SourceRange getSourceRange() const override LLVM_READONLY { return SourceRange(NamespaceLoc, IdentLoc); diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h index 3560766433fe2..a284f2c44d633 100644 --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -489,7 +489,8 @@ class FormatSpecifier { /// For a TypedefType QT, if it is a named integer type such as size_t, /// assign the appropriate value to LM and return true. - static bool namedTypeToLengthModifier(QualType QT, LengthModifier &LM); + static bool namedTypeToLengthModifier(ASTContext &Ctx, QualType QT, + LengthModifier &LM); }; } // end analyze_format_string namespace diff --git a/clang/include/clang/AST/NestedNameSpecifier.h b/clang/include/clang/AST/NestedNameSpecifier.h index 952c79753d10a..1614f9d7c94e4 100644 --- a/clang/include/clang/AST/NestedNameSpecifier.h +++ b/clang/include/clang/AST/NestedNameSpecifier.h @@ -31,8 +31,7 @@ class ASTContext; class CXXRecordDecl; class IdentifierInfo; class LangOptions; -class NamespaceAliasDecl; -class NamespaceDecl; +class NamespaceBaseDecl; struct PrintingPolicy; class Type; class TypeLoc; @@ -79,12 +78,9 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { /// An identifier, stored as an IdentifierInfo*. Identifier, - /// A namespace, stored as a NamespaceDecl*. + /// A namespace-like entity, stored as a NamespaceBaseDecl*. Namespace, - /// A namespace alias, stored as a NamespaceAliasDecl*. - NamespaceAlias, - /// A type, stored as a Type*. TypeSpec, @@ -121,15 +117,10 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { NestedNameSpecifier *Prefix, const IdentifierInfo *II); - /// Builds a nested name specifier that names a namespace. 
- static NestedNameSpecifier *Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - const NamespaceDecl *NS); - - /// Builds a nested name specifier that names a namespace alias. + /// Builds a nested name specifier that names a namespace or namespace alias. static NestedNameSpecifier *Create(const ASTContext &Context, NestedNameSpecifier *Prefix, - const NamespaceAliasDecl *Alias); + const NamespaceBaseDecl *NS); /// Builds a nested name specifier that names a type. static NestedNameSpecifier * @@ -174,13 +165,9 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { return nullptr; } - /// Retrieve the namespace stored in this nested name + /// Retrieve the namespace or namespace alias stored in this nested name /// specifier. - NamespaceDecl *getAsNamespace() const; - - /// Retrieve the namespace alias stored in this nested name - /// specifier. - NamespaceAliasDecl *getAsNamespaceAlias() const; + NamespaceBaseDecl *getAsNamespace() const; /// Retrieve the record declaration stored in this nested name /// specifier. @@ -425,29 +412,15 @@ class NestedNameSpecifierLocBuilder { /// \param Context The AST context in which this nested-name-specifier /// resides. /// - /// \param Namespace The namespace. + /// \param Namespace The namespace or namespace alias. /// - /// \param NamespaceLoc The location of the namespace name. + /// \param NamespaceLoc The location of the namespace name or the namespace + // alias. /// /// \param ColonColonLoc The location of the trailing '::'. - void Extend(ASTContext &Context, NamespaceDecl *Namespace, + void Extend(ASTContext &Context, NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc); - /// Extend the current nested-name-specifier by another - /// nested-name-specifier component of the form 'namespace-alias::'. - /// - /// \param Context The AST context in which this nested-name-specifier - /// resides. - /// - /// \param Alias The namespace alias. - /// - /// \param AliasLoc The location of the namespace alias - /// name. - /// - /// \param ColonColonLoc The location of the trailing '::'. - void Extend(ASTContext &Context, NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, SourceLocation ColonColonLoc); - /// Turn this (empty) nested-name-specifier into the global /// nested-name-specifier '::'. 
void MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc); diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index 1215056ffde1b..0438e4dfbafac 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -91,6 +91,7 @@ def DeclRef : RefPropertyType<"Decl"> { let ConstWhenWriting = 1; } SubclassPropertyType<"FunctionDecl", DeclRef>; def NamedDeclRef : SubclassPropertyType<"NamedDecl", DeclRef>; + def NamespaceBaseDeclRef : SubclassPropertyType<"NamespaceBaseDecl", DeclRef>; def NamespaceDeclRef : SubclassPropertyType<"NamespaceDecl", DeclRef>; def NamespaceAliasDeclRef : diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 5cb2f57edffe4..62991d986e675 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -789,7 +789,6 @@ bool RecursiveASTVisitor::TraverseNestedNameSpecifier( switch (NNS->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: return true; @@ -813,7 +812,6 @@ bool RecursiveASTVisitor::TraverseNestedNameSpecifierLoc( switch (NNS.getNestedNameSpecifier()->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: return true; @@ -1210,6 +1208,8 @@ DEF_TRAVERSE_TYPE(BitIntType, {}) DEF_TRAVERSE_TYPE(DependentBitIntType, { TRY_TO(TraverseStmt(T->getNumBitsExpr())); }) +DEF_TRAVERSE_TYPE(PredefinedSugarType, {}) + #undef DEF_TRAVERSE_TYPE // ----------------- TypeLoc traversal ----------------- @@ -1526,6 +1526,8 @@ DEF_TRAVERSE_TYPELOC(DependentBitIntType, { TRY_TO(TraverseStmt(TL.getTypePtr()->getNumBitsExpr())); }) +DEF_TRAVERSE_TYPELOC(PredefinedSugarType, {}) + #undef DEF_TRAVERSE_TYPELOC // ----------------- Decl traversal ----------------- diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 21b97102db95a..764e9d508a25a 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2258,6 +2258,30 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned NumExpansions; }; + enum class PredefinedSugarKind { + /// The "size_t" type. + SizeT, + + /// The signed integer type corresponding to "size_t". + SignedSizeT, + + /// The "ptrdiff_t" type. + PtrdiffT, + + // Indicates how many items the enum has. 
+ Last = PtrdiffT + }; + + class PresefinedSugarTypeBitfields { + friend class PredefinedSugarType; + + LLVM_PREFERRED_TYPE(TypeBitfields) + unsigned : NumTypeBits; + + LLVM_PREFERRED_TYPE(PredefinedSugarKind) + unsigned Kind : 8; + }; + class CountAttributedTypeBitfields { friend class CountAttributedType; @@ -2297,6 +2321,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { DependentTemplateSpecializationTypeBits; PackExpansionTypeBitfields PackExpansionTypeBits; CountAttributedTypeBitfields CountAttributedTypeBits; + PresefinedSugarTypeBitfields PredefinedSugarTypeBits; }; private: @@ -8038,6 +8063,37 @@ class DependentBitIntType final : public Type, public llvm::FoldingSetNode { } }; +class PredefinedSugarType final : public Type { +public: + friend class ASTContext; + using Kind = PredefinedSugarKind; + +private: + PredefinedSugarType(Kind KD, const IdentifierInfo *IdentName, + QualType CanonicalType) + : Type(PredefinedSugar, CanonicalType, TypeDependence::None), + Name(IdentName) { + PredefinedSugarTypeBits.Kind = llvm::to_underlying(KD); + } + + static StringRef getName(Kind KD); + + const IdentifierInfo *Name; + +public: + bool isSugared() const { return true; } + + QualType desugar() const { return getCanonicalTypeInternal(); } + + Kind getKind() const { return Kind(PredefinedSugarTypeBits.Kind); } + + const IdentifierInfo *getIdentifier() const { return Name; } + + static bool classof(const Type *T) { + return T->getTypeClass() == PredefinedSugar; + } +}; + /// A qualifier set is used to build a set of qualifiers. class QualifierCollector : public Qualifiers { public: diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index cf06e27758996..be0bc896de3ea 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -2783,6 +2783,16 @@ class ObjCProtocolLoc { } }; +struct PredefinedSugarTypeLocInfo {}; // Nothing. + +class PredefinedSugarTypeLoc final + : public ConcreteTypeLoc { +public: + void initializeLocal(ASTContext &Context, SourceLocation loc) {} + SourceRange getLocalSourceRange() const { return {}; } +}; + } // namespace clang #endif // LLVM_CLANG_AST_TYPELOC_H diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index a6157649060b1..3114d1180319a 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -1028,3 +1028,12 @@ let Class = DependentBitIntType in { return ctx.getDependentBitIntType(isUnsigned, numBitsExpr); }]>; } + +let Class = PredefinedSugarType in { + def : Property<"kind", UInt32> { + let Read = [{ static_cast(node->getKind()) }]; + } + def : Creator<[{ + return ctx.getPredefinedSugarType(static_cast(kind)); + }]>; +} diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h index b364b6556d0b3..08c898f7758ec 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -7894,9 +7894,9 @@ AST_MATCHER_P_OVERLOAD(NestedNameSpecifierLoc, hasPrefix, /// matches "ns::" AST_MATCHER_P(NestedNameSpecifier, specifiesNamespace, internal::Matcher, InnerMatcher) { - if (!Node.getAsNamespace()) - return false; - return InnerMatcher.matches(*Node.getAsNamespace(), Finder, Builder); + if (auto *NS = dyn_cast_if_present(Node.getAsNamespace())) + return InnerMatcher.matches(*NS, Finder, Builder); + return false; } /// Matches attributes. 
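For AST-matcher users, the updated ``specifiesNamespace`` keeps its documented behaviour: it matches only when the specifier resolves to an actual ``NamespaceDecl``, and the new ``dyn_cast_if_present`` simply makes it return false for namespace aliases or an absent entity. A small usage sketch (the matcher variable is illustrative):

```cpp
#include "clang/ASTMatchers/ASTMatchers.h"

using namespace clang::ast_matchers;

// Given 'namespace ns { struct A {}; } ns::A a;', this binds the 'ns::'
// specifier; a specifier spelled through a namespace alias is not matched,
// because the dyn_cast to NamespaceDecl fails.
static const auto NsSpecifier =
    nestedNameSpecifier(specifiesNamespace(hasName("ns"))).bind("ns-spec");
```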
diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety.h index 9998702a41cab..beeb0aaba5d0d 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety.h @@ -17,14 +17,96 @@ //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H #define LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H -#include "clang/AST/DeclBase.h" #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" -namespace clang { +#include "llvm/ADT/ImmutableSet.h" +#include "llvm/ADT/StringMap.h" +#include -void runLifetimeSafetyAnalysis(const DeclContext &DC, const CFG &Cfg, - AnalysisDeclContext &AC); +namespace clang::lifetimes { -} // namespace clang +/// The main entry point for the analysis. +void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC); + +namespace internal { +// Forward declarations of internal types. +class Fact; +class FactManager; +class LoanPropagationAnalysis; +struct LifetimeFactory; + +/// A generic, type-safe wrapper for an ID, distinguished by its `Tag` type. +/// Used for giving ID to loans and origins. +template struct ID { + uint32_t Value = 0; + + bool operator==(const ID &Other) const { return Value == Other.Value; } + bool operator!=(const ID &Other) const { return !(*this == Other); } + bool operator<(const ID &Other) const { return Value < Other.Value; } + ID operator++(int) { + ID Tmp = *this; + ++Value; + return Tmp; + } + void Profile(llvm::FoldingSetNodeID &IDBuilder) const { + IDBuilder.AddInteger(Value); + } +}; + +using LoanID = ID; +using OriginID = ID; + +// Using LLVM's immutable collections is efficient for dataflow analysis +// as it avoids deep copies during state transitions. +// TODO(opt): Consider using a bitset to represent the set of loans. +using LoanSet = llvm::ImmutableSet; +using OriginSet = llvm::ImmutableSet; + +/// A `ProgramPoint` identifies a location in the CFG by pointing to a specific +/// `Fact`. identified by a lifetime-related event (`Fact`). +/// +/// A `ProgramPoint` has "after" semantics: it represents the location +/// immediately after its corresponding `Fact`. +using ProgramPoint = const Fact *; + +/// Running the lifetime safety analysis and querying its results. It +/// encapsulates the various dataflow analyses. +class LifetimeSafetyAnalysis { +public: + LifetimeSafetyAnalysis(AnalysisDeclContext &AC); + ~LifetimeSafetyAnalysis(); + + void run(); + + /// Returns the set of loans an origin holds at a specific program point. + LoanSet getLoansAtPoint(OriginID OID, ProgramPoint PP) const; + + /// Finds the OriginID for a given declaration. + /// Returns a null optional if not found. + std::optional getOriginIDForDecl(const ValueDecl *D) const; + + /// Finds the LoanID's for the loan created with the specific variable as + /// their Path. + std::vector getLoanIDForVar(const VarDecl *VD) const; + + /// Retrieves program points that were specially marked in the source code + /// for testing. + /// + /// The analysis recognizes special function calls of the form + /// `void("__lifetime_test_point_")` as test points. This method returns + /// a map from the annotation string () to the corresponding + /// `ProgramPoint`. This allows test harnesses to query the analysis state at + /// user-defined locations in the code. + /// \note This is intended for testing only. 
+ llvm::StringMap getTestPoints() const; + +private: + AnalysisDeclContext &AC; + std::unique_ptr Factory; + std::unique_ptr FactMgr; + std::unique_ptr LoanPropagation; +}; +} // namespace internal +} // namespace clang::lifetimes #endif // LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 5ebb82180521d..c81714e9b009d 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1334,6 +1334,18 @@ def ElementwiseMinimum : Builtin { let Prototype = "void(...)"; } +def ElementwiseMaximumNum : Builtin { + let Spellings = ["__builtin_elementwise_maximumnum"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseMinimumNum : Builtin { + let Spellings = ["__builtin_elementwise_minimumnum"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + def ElementwiseCeil : Builtin { let Spellings = ["__builtin_elementwise_ceil"]; let Attributes = [NoThrow, Const, CustomTypeChecking]; diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 29e1e99bba9ef..878543566f0e3 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -164,6 +164,7 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b96, "V3UiQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n") TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts") +TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi", "t", "vmem-to-lds-load-insts") //===----------------------------------------------------------------------===// // Ballot builtins. 
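The elementwise builtins registered in ``Builtins.td`` above use custom type checking, so they accept scalar or vector floating-point operands like the existing ``__builtin_elementwise_minnum``/``maxnum``. A hedged sketch of a call site (the typedef and function are illustrative; the NaN behaviour follows IEEE-754 maximumNumber only to the extent the underlying intrinsics do):

```cpp
// Illustrative only: lane-wise maximum that prefers the numeric operand when
// one lane is NaN, using Clang's ext_vector_type extension.
typedef float float4 __attribute__((ext_vector_type(4)));

float4 lanewise_max(float4 a, float4 b) {
  return __builtin_elementwise_maximumnum(a, b);
}
```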
@@ -668,13 +669,23 @@ TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_tanhf, "ff", "nc", "tanh-insts") +TARGET_BUILTIN(__builtin_amdgcn_tanhh, "hh", "nc", "tanh-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_sqrt_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_exp2_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_sin_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_cos_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts") // GFX1250 WMMA builtins TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") @@ -694,6 +705,7 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, "V8hV16iV16iIsV8hIbI TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4, "V8fIiV16iIiV16iIsV8f", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") diff --git a/clang/include/clang/Basic/BuiltinsSPIRVVK.td b/clang/include/clang/Basic/BuiltinsSPIRVVK.td index 61cc0343c415e..5dc3c7588cd2a 100644 --- a/clang/include/clang/Basic/BuiltinsSPIRVVK.td +++ b/clang/include/clang/Basic/BuiltinsSPIRVVK.td @@ -11,3 +11,4 @@ include "clang/Basic/BuiltinsSPIRVBase.td" def reflect : SPIRVBuiltin<"void(...)", [NoThrow, Const]>; def faceforward : SPIRVBuiltin<"void(...)", [NoThrow, Const, CustomTypeChecking]>; +def refract : SPIRVBuiltin<"void(...)", [NoThrow, Const, CustomTypeChecking]>; diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index a11e12d495cd2..cfffeb71f09d1 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -56,6 +56,8 @@ CODEGENOPT(XCOFFReadOnlyPointers, 1, 0, Benign) ///< Set for -mxcoff-roptr. 
CODEGENOPT(AllTocData, 1, 0, Benign) ///< AIX -mtocdata ENUM_CODEGENOPT(FramePointer, FramePointerKind, 2, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,reserved,none +ENUM_CODEGENOPT(ExceptionHandling, ExceptionHandlingKind, 3, ExceptionHandlingKind::None, NotCompatible) + CODEGENOPT(ClearASTBeforeBackend , 1, 0, Benign) ///< Free the AST before running backend code generation. Only works with -disable-free. CODEGENOPT(DisableFree , 1, 0, Benign) ///< Don't free memory. CODEGENOPT(DiscardValueNames , 1, 0, Benign) ///< Discard Value Names from the IR (LLVMContext flag) diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index df4403ace5fe3..cdeedd5b4eac6 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -176,6 +176,9 @@ class CodeGenOptions : public CodeGenOptionsBase { llvm_unreachable("invalid FramePointerKind"); } + /// Possible exception handling behavior. + enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm }; + enum class SwiftAsyncFramePointerKind { Auto, // Choose Swift async extended frame info based on deployment target. Always, // Unconditionally emit Swift async extended frame info. @@ -552,6 +555,22 @@ class CodeGenOptions : public CodeGenOptionsBase { return NoBuiltinFuncs; } + bool hasSjLjExceptions() const { + return getExceptionHandling() == ExceptionHandlingKind::SjLj; + } + + bool hasSEHExceptions() const { + return getExceptionHandling() == ExceptionHandlingKind::WinEH; + } + + bool hasDWARFExceptions() const { + return getExceptionHandling() == ExceptionHandlingKind::DwarfCFI; + } + + bool hasWasmExceptions() const { + return getExceptionHandling() == ExceptionHandlingKind::Wasm; + } + /// Check if Clang profile instrumenation is on. bool hasProfileClangInstr() const { return getProfileInstr() == diff --git a/clang/include/clang/Basic/DeclNodes.td b/clang/include/clang/Basic/DeclNodes.td index f1ebaf1db3fc0..8d6731b50f509 100644 --- a/clang/include/clang/Basic/DeclNodes.td +++ b/clang/include/clang/Basic/DeclNodes.td @@ -15,9 +15,10 @@ def PragmaComment : DeclNode; def PragmaDetectMismatch : DeclNode; def ExternCContext : DeclNode, DeclContext; def Named : DeclNode; - def Namespace : DeclNode, DeclContext; + def NamespaceBase : DeclNode; + def Namespace : DeclNode, DeclContext; + def NamespaceAlias : DeclNode; def UsingDirective : DeclNode; - def NamespaceAlias : DeclNode; def Label : DeclNode; def Type : DeclNode; def TypedefName : DeclNode; diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index c7a627600f3cc..cee5bed665d0a 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -895,7 +895,10 @@ class DiagnosticsEngine : public RefCountedBase { /// \param FormatString A fixed diagnostic format string that will be hashed /// and mapped to a unique DiagID. template - // TODO: Deprecate this once all uses are removed from Clang. + // FIXME: this API should almost never be used; custom diagnostics do not + // have an associated diagnostic group and thus cannot be controlled by users + // like other diagnostics. The number of times this API is used in Clang + // should only ever be reduced, not increased. 
// [[deprecated("Use a CustomDiagDesc instead of a Level")]] unsigned getCustomDiagID(Level L, const char (&FormatString)[N]) { return Diags->getCustomDiagID((DiagnosticIDs::Level)L, diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index a67b9995d3b54..071a38f513911 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -174,10 +174,11 @@ def note_constexpr_heap_alloc_limit_exceeded : Note< def note_constexpr_this : Note< "%select{|implicit }0use of 'this' pointer is only allowed within the " "evaluation of a call to a 'constexpr' member function">; -def access_kind : TextSubstitution< - "%select{read of|read of|assignment to|increment of|decrement of|" - "member call on|dynamic_cast of|typeid applied to|construction of|" - "destruction of|read of}0">; +def access_kind + : TextSubstitution< + "%select{read of|read of|assignment to|increment of|decrement of|" + "member call on|dynamic_cast of|typeid applied to|construction of|" + "destruction of|read of|read of}0">; def access_kind_subobject : TextSubstitution< "%select{read of|read of|assignment to|increment of|decrement of|" "member call on|dynamic_cast of|typeid applied to|" @@ -222,6 +223,9 @@ def note_constexpr_ltor_incomplete_type : Note< def note_constexpr_access_null : Note< "%sub{access_kind}0 " "dereferenced null pointer is not allowed in a constant expression">; +def note_constexpr_dereferencing_null + : Note<"dereferencing a null pointer is not allowed in a constant " + "expression">; def note_constexpr_access_past_end : Note< "%sub{access_kind}0 dereferenced one-past-the-end pointer " "is not allowed in a constant expression">; diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 34b6c0d7a8acd..759ba0419bd45 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -116,6 +116,8 @@ def err_drv_cuda_host_arch : Error< "unsupported architecture '%0' for host compilation">; def err_drv_mix_cuda_hip : Error< "mixed CUDA and HIP compilation is not supported">; +def err_drv_mix_offload : Error< + "mixed %0 and %1 offloading compilation is not supported">; def err_drv_bad_target_id : Error< "invalid target ID '%0'; format is a processor name followed by an optional " "colon-delimited list of features followed by an enable/disable sign (e.g., " diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index c28a919e35d08..ccb18aa37447e 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1293,6 +1293,7 @@ def ThreadSafetyBeta : DiagGroup<"thread-safety-beta">; // Warnings and notes related to the function effects system which underlies // the nonblocking and nonallocating attributes. 
def FunctionEffects : DiagGroup<"function-effects">; +def FunctionEffectRedeclarations : DiagGroup<"function-effect-redeclarations">; def PerfConstraintImpliesNoexcept : DiagGroup<"perf-constraint-implies-noexcept">; // Uniqueness Analysis warnings diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index 2b095f0fd6741..f07a003f3fdef 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -283,7 +283,10 @@ class DiagnosticIDs : public RefCountedBase { // writing, nearly all callers of this function were invalid. unsigned getCustomDiagID(CustomDiagDesc Diag); - // TODO: Deprecate this once all uses are removed from LLVM + // FIXME: this API should almost never be used; custom diagnostics do not + // have an associated diagnostic group and thus cannot be controlled by users + // like other diagnostics. The number of times this API is used in Clang + // should only ever be reduced, not increased. // [[deprecated("Use a CustomDiagDesc instead of a Level")]] unsigned getCustomDiagID(Level Level, StringRef Message) { return getCustomDiagID([&]() -> CustomDiagDesc { diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 723f5d48b4f5f..c7fe6e1db6d1f 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -694,6 +694,9 @@ def err_pragma_push_pop_macro_malformed : Error< def warn_pragma_pop_macro_no_push : Warning< "pragma pop_macro could not pop '%0', no matching push_macro">, InGroup; +def warn_pargma_push_pop_macro_empty_string : Warning< + "'#pragma %select{push_macro|pop_macro}0' expected a non-empty string">, + InGroup; def warn_pragma_message : Warning<"%0">, InGroup, DefaultWarnNoWerror; def err_pragma_message : Error<"%0">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 2781ff81ab4cf..b2ea65ae111be 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11530,17 +11530,28 @@ def note_in_evaluating_default_argument : Note< def warn_invalid_add_func_effects : Warning< "attribute '%0' should not be added via type conversion">, InGroup, DefaultIgnore; -def warn_mismatched_func_effect_override : Warning< - "attribute '%0' on overriding function does not match base declaration">, - InGroup, DefaultIgnore; -def warn_mismatched_func_effect_redeclaration : Warning< - "attribute '%0' on function does not match previous declaration">, - InGroup, DefaultIgnore; +def warn_conflicting_func_effect_override + : Warning<"attribute '%0' on overriding function conflicts with base " + "declaration">, + InGroup, + DefaultIgnore; def warn_conflicting_func_effects : Warning< "effects conflict when merging declarations; kept '%0', discarded '%1'">, InGroup, DefaultIgnore; def err_func_with_effects_no_prototype : Error< "'%0' function must have a prototype">; +// These are more pedantic: in redeclarations and virtual method overrides, +// the effect attribute(s) should be restated. 
+def warn_mismatched_func_effect_override + : Warning<"overriding function is missing '%0' attribute from base " + "declaration">, + InGroup, + DefaultIgnore; +def warn_mismatched_func_effect_redeclaration + : Warning< + "redeclaration is missing '%0' attribute from previous declaration">, + InGroup, + DefaultIgnore; } // end of sema category @@ -12357,7 +12368,7 @@ def err_export_using_internal : Error< "using declaration referring to %1 with %select{internal|module|unknown}0 " "linkage cannot be exported">; def err_export_not_in_module_interface : Error< - "export declaration can only be used within a module purview">; + "export declaration can only be used within a module interface">; def err_export_inline_not_defined : Error< "inline function not defined%select{| before the private module fragment}0">; def err_export_partition_impl : Error< @@ -13478,6 +13489,12 @@ def err_acc_invalid_default_type def err_acc_device_type_multiple_archs : Error<"OpenACC 'device_type' clause on a 'set' construct only permits " "one architecture">; +def warn_acc_var_referenced_lacks_op + : Warning<"variable of type %0 referenced in OpenACC '%1' clause does not " + "have a %enum_select{%DefCtor{default " + "constructor}|%Dtor{destructor}}2; reference has no effect">, + InGroup>, + DefaultError; // AMDGCN builtins diagnostics def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">; diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 6c47107796236..08d98a77e0252 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -98,8 +98,6 @@ LANGOPT(Exceptions , 1, 0, NotCompatible, "exception handling") LANGOPT(ObjCExceptions , 1, 0, NotCompatible, "Objective-C exceptions") LANGOPT(CXXExceptions , 1, 0, NotCompatible, "C++ exceptions") LANGOPT(EHAsynch , 1, 0, NotCompatible, "C/C++ EH Asynch exceptions") -ENUM_LANGOPT(ExceptionHandling, ExceptionHandlingKind, 3, - ExceptionHandlingKind::None, NotCompatible, "exception handling") LANGOPT(IgnoreExceptions , 1, 0, NotCompatible, "ignore exceptions") LANGOPT(ExternCNoUnwind , 1, 0, NotCompatible, "Assume extern C functions don't unwind") LANGOPT(AssumeNothrowExceptionDtor , 1, 0, NotCompatible, "Assume exception object's destructor is nothrow") @@ -493,6 +491,8 @@ LANGOPT(CheckConstexprFunctionBodies, 1, 1, Benign, LANGOPT(BoundsSafety, 1, 0, NotCompatible, "Bounds safety extension for C") +LANGOPT(EnableLifetimeSafety, 1, 0, NotCompatible, "Experimental lifetime safety analysis for C++") + LANGOPT(PreserveVec3Type, 1, 0, NotCompatible, "Preserve 3-component vector type") #undef LANGOPT diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 4c642c9e10c91..0407897359b5e 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -337,9 +337,6 @@ class LangOptionsBase { enum ExcessPrecisionKind { FPP_Standard, FPP_Fast, FPP_None }; - /// Possible exception handling behavior. - enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm }; - enum class LaxVectorConversionKind { /// Permit no implicit vector bitcasts. None, @@ -636,11 +633,6 @@ class LangOptions : public LangOptionsBase { // received as a result of a standard operator new (-fcheck-new) bool CheckNew = false; - // In OpenACC mode, contains a user provided override for the _OPENACC macro. 
- // This exists so that we can override the macro value and test our incomplete - // implementation on real-world examples. - std::string OpenACCMacroOverride; - /// The HLSL root signature version for dxil. llvm::dxbc::RootSignatureVersion HLSLRootSigVer = llvm::dxbc::RootSignatureVersion::V1_1; @@ -788,22 +780,6 @@ class LangOptions : public LangOptionsBase { return getSignReturnAddressScope() == SignReturnAddressScopeKind::All; } - bool hasSjLjExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::SjLj; - } - - bool hasSEHExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::WinEH; - } - - bool hasDWARFExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::DwarfCFI; - } - - bool hasWasmExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::Wasm; - } - bool isSYCL() const { return SYCLIsDevice || SYCLIsHost; } bool hasDefaultVisibilityExportMapping() const { diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td index 567b8a5ca5a4d..971ce541d4831 100644 --- a/clang/include/clang/Basic/TypeNodes.td +++ b/clang/include/clang/Basic/TypeNodes.td @@ -117,3 +117,4 @@ def PipeType : TypeNode; def AtomicType : TypeNode; def BitIntType : TypeNode; def DependentBitIntType : TypeNode, AlwaysDependent; +def PredefinedSugarType : TypeNode, NeverCanonical; diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index 277c278fd38b7..25baf278bba38 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -129,6 +129,22 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { cir::BoolAttr getTrueAttr() { return getCIRBoolAttr(true); } cir::BoolAttr getFalseAttr() { return getCIRBoolAttr(false); } + mlir::Value createComplexCreate(mlir::Location loc, mlir::Value real, + mlir::Value imag) { + auto resultComplexTy = cir::ComplexType::get(real.getType()); + return create(loc, resultComplexTy, real, imag); + } + + mlir::Value createComplexReal(mlir::Location loc, mlir::Value operand) { + auto operandTy = mlir::cast(operand.getType()); + return create(loc, operandTy.getElementType(), operand); + } + + mlir::Value createComplexImag(mlir::Location loc, mlir::Value operand) { + auto operandTy = mlir::cast(operand.getType()); + return create(loc, operandTy.getElementType(), operand); + } + mlir::Value createNot(mlir::Value value) { return create(value.getLoc(), value.getType(), cir::UnaryOpKind::Not, value); @@ -169,6 +185,11 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return create(loc); } + mlir::Value createUnaryOp(mlir::Location loc, cir::UnaryOpKind kind, + mlir::Value operand) { + return create(loc, kind, operand); + } + mlir::TypedAttr getConstPtrAttr(mlir::Type type, int64_t value) { return cir::ConstPtrAttr::get(type, getI64IntegerAttr(value)); } diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 2ce23dbb27ec6..694e3691c9361 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -1739,7 +1739,8 @@ def CIR_SetBitfieldOp : CIR_Op<"set_bitfield"> { %2 = cir.load %0 : !cir.ptr>, !cir.ptr %3 = cir.get_member %2[1] {name = "e"} : !cir.ptr -> !cir.ptr - %4 = cir.set_bitfield(#bfi_e, %3 : !cir.ptr, %1 : !s32i) -> !s32i + %4 = cir.set_bitfield align(4) (#bfi_e, %3 : !cir.ptr, %1 : !s32i) + -> !s32i ``` }]; @@ 
-1747,12 +1748,15 @@ def CIR_SetBitfieldOp : CIR_Op<"set_bitfield"> { Arg:$addr, CIR_AnyType:$src, BitfieldInfoAttr:$bitfield_info, + DefaultValuedOptionalAttr:$alignment, UnitAttr:$is_volatile ); let results = (outs CIR_IntType:$result); - let assemblyFormat = [{ `(`$bitfield_info`,` $addr`:`qualified(type($addr))`,` + let assemblyFormat = [{ + (`align` `(` $alignment^ `)`)? + `(`$bitfield_info`,` $addr`:`qualified(type($addr))`,` $src`:`type($src) `)` attr-dict `->` type($result) }]; let builders = [ @@ -1764,14 +1768,15 @@ def CIR_SetBitfieldOp : CIR_Op<"set_bitfield"> { "unsigned":$size, "unsigned":$offset, "bool":$is_signed, - "bool":$is_volatile + "bool":$is_volatile, + CArg<"unsigned", "0">:$alignment ), [{ BitfieldInfoAttr info = BitfieldInfoAttr::get($_builder.getContext(), name, storage_type, size, offset, is_signed); - build($_builder, $_state, type, addr, src, info, is_volatile); + build($_builder, $_state, type, addr, src, info, alignment, is_volatile); }]> ]; } @@ -1823,20 +1828,23 @@ def CIR_GetBitfieldOp : CIR_Op<"get_bitfield"> { %2 = cir.load %0 : !cir.ptr>, !cir.ptr %3 = cir.get_member %2[1] {name = "e"} : !cir.ptr -> !cir.ptr - %4 = cir.get_bitfield(#bfi_e, %3 : !cir.ptr) -> !s32i + %4 = cir.get_bitfield align(4) (#bfi_e, %3 : !cir.ptr) -> !s32i ``` }]; let arguments = (ins Arg:$addr, BitfieldInfoAttr:$bitfield_info, + DefaultValuedOptionalAttr:$alignment, UnitAttr:$is_volatile ); let results = (outs CIR_IntType:$result); - let assemblyFormat = [{ `(`$bitfield_info `,` $addr attr-dict `:` - qualified(type($addr)) `)` `->` type($result) }]; + let assemblyFormat = [{ + (`align` `(` $alignment^ `)`)? + `(`$bitfield_info `,` $addr attr-dict `:` + qualified(type($addr)) `)` `->` type($result) }]; let builders = [ OpBuilder<(ins "mlir::Type":$type, @@ -1846,14 +1854,15 @@ def CIR_GetBitfieldOp : CIR_Op<"get_bitfield"> { "unsigned":$size, "unsigned":$offset, "bool":$is_signed, - "bool":$is_volatile + "bool":$is_volatile, + CArg<"unsigned", "0">:$alignment ), [{ BitfieldInfoAttr info = BitfieldInfoAttr::get($_builder.getContext(), name, storage_type, size, offset, is_signed); - build($_builder, $_state, type, addr, info, is_volatile); + build($_builder, $_state, type, addr, info, alignment, is_volatile); }]> ]; } @@ -2934,6 +2943,45 @@ def CIR_ByteSwapOp : CIR_BitOpBase<"byte_swap", }]; } +//===----------------------------------------------------------------------===// +// RotateOp +//===----------------------------------------------------------------------===// + +def CIR_RotateOp : CIR_Op<"rotate", [Pure, SameOperandsAndResultType]> { + let summary = "Rotate the bits in the operand integer"; + let description = [{ + The `cir.rotate` rotates the bits in `input` by the given amount `amount`. + The rotate direction is specified by the `left` and `right` keyword. + + `input` must be an unsigned integer and its width must be either 8, 16, 32, + or 64. The types of `input`, `amount`, and the result must all match. + + Example: + + ```mlir + %r = cir.rotate left %0, %1 : !u32i + %r = cir.rotate right %0, %1 : !u32i + ``` + }]; + + let results = (outs CIR_IntType:$result); + let arguments = (ins + CIR_UIntOfWidths<[8, 16, 32, 64]>:$input, + CIR_IntType:$amount, + UnitAttr:$rotateLeft + ); + + let assemblyFormat = [{ + (`left` $rotateLeft^) : (`right`)? 
+ $input `,` $amount `:` type($result) attr-dict + }]; + + let extraClassDeclaration = [{ + bool isRotateLeft() { return getRotateLeft(); } + bool isRotateRight() { return !getRotateLeft(); } + }]; +} + //===----------------------------------------------------------------------===// // Assume Operations //===----------------------------------------------------------------------===// @@ -2956,6 +3004,27 @@ def CIR_AssumeOp : CIR_Op<"assume"> { }]; } +def CIR_AssumeSepStorageOp : CIR_Op<"assume_separate_storage", [ + SameTypeOperands +]> { + let summary = + "Tell the optimizer that two pointers point to different allocations"; + let description = [{ + The `cir.assume_separate_storage` operation takes two pointers as arguments, + and the operation tells the optimizer that these two pointers point to + different allocations. + + This operation corresponds to the `__builtin_assume_separate_storage` + builtin function. + }]; + + let arguments = (ins CIR_VoidPtrType:$ptr1, CIR_VoidPtrType:$ptr2); + + let assemblyFormat = [{ + $ptr1 `,` $ptr2 `:` qualified(type($ptr1)) attr-dict + }]; +} + //===----------------------------------------------------------------------===// // Branch Probability Operations //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 48e309063d38b..37e0a4c8c1b6b 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -252,8 +252,8 @@ struct MissingFeatures { static bool writebacks() { return false; } static bool appleKext() { return false; } static bool dtorCleanups() { return false; } - static bool completeDtors() { return false; } static bool vtableInitialization() { return false; } + static bool msvcBuiltins() { return false; } // Missing types static bool dataMemberType() { return false; } diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index d9e328fe918bc..78a4c5738ae66 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -355,6 +355,9 @@ class Driver { phases::ID getFinalPhase(const llvm::opt::DerivedArgList &DAL, llvm::opt::Arg **FinalPhaseArg = nullptr) const; + llvm::Expected> + executeProgram(llvm::ArrayRef Args) const; + private: /// Certain options suppress the 'no input files' warning. LLVM_PREFERRED_TYPE(bool) @@ -367,10 +370,6 @@ class Driver { /// stored in it, and will clean them up when torn down. mutable llvm::StringMap> ToolChains; - /// The associated offloading architectures with each toolchain. - llvm::DenseMap> - OffloadArchs; - private: /// TranslateInputArgs - Create a new derived argument list from the input /// arguments, after applying the standard argument translations. @@ -537,8 +536,7 @@ class Driver { /// empty string. llvm::SmallVector getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args, - Action::OffloadKind Kind, const ToolChain *TC, - bool SpecificToolchain = true) const; + Action::OffloadKind Kind, const ToolChain &TC) const; /// Check that the file referenced by Value exists. If it doesn't, /// issue a diagnostic and return false. 
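The ``cir.assume_separate_storage`` operation introduced above models Clang's ``__builtin_assume_separate_storage`` builtin. A small source-level sketch of the pattern it represents (the function and loop are illustrative):

```cpp
// Illustrative only: promise the optimizer that dst and src come from
// different allocations, so the accesses below cannot alias.
void scale(float *dst, float *src, int n) {
  __builtin_assume_separate_storage(dst, src);
  for (int i = 0; i < n; ++i)
    dst[i] = 2.0f * src[i];
}
```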
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index bce29a76f3ac7..55e90b7a0048d 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1156,7 +1156,7 @@ def offload_arch_EQ : CommaJoined<["--"], "offload-arch=">, "If 'native' is used the compiler will detect locally installed architectures. " "For HIP offloading, the device architecture can be followed by target ID features " "delimited by a colon (e.g. gfx908:xnack+:sramecc-). May be specified more than once.">; -def no_offload_arch_EQ : Joined<["--"], "no-offload-arch=">, +def no_offload_arch_EQ : CommaJoined<["--"], "no-offload-arch=">, Visibility<[ClangOption, FlangOption]>, HelpText<"Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. " "'all' resets the list to its default value.">; @@ -1422,19 +1422,6 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, HelpText<"Do not override toolchain to compile HIP source to relocatable">; } -// Clang specific/exclusive options for OpenACC. -def openacc_macro_override - : Separate<["-"], "fexperimental-openacc-macro-override">, - Visibility<[ClangOption, CC1Option]>, - Group, - HelpText<"Overrides the _OPENACC macro value for experimental testing " - "during OpenACC support development">; -def openacc_macro_override_EQ - : Joined<["-"], "fexperimental-openacc-macro-override=">, - Alias; - -// End Clang specific/exclusive options for OpenACC. - def libomptarget_amdgpu_bc_path_EQ : Joined<["--"], "libomptarget-amdgpu-bc-path=">, Group, HelpText<"Path to libomptarget-amdgcn bitcode library">; def libomptarget_amdgcn_bc_path_EQ : Joined<["--"], "libomptarget-amdgcn-bc-path=">, Group, @@ -1637,7 +1624,7 @@ defm auto_import : BoolFOption<"auto-import", // In the future this option will be supported by other offloading // languages and accept other values such as CPU/GPU architectures, // offload kinds and target aliases. 
-def offload_EQ : CommaJoined<["--"], "offload=">, Flags<[NoXarchOption]>, +def offload_EQ : CommaJoined<["--"], "offload=">, Flags<[NoXarchOption]>, Alias, HelpText<"Specify comma-separated list of offloading target triples (CUDA and HIP only)">; // C++ Coroutines @@ -1917,6 +1904,14 @@ defm bounds_safety : BoolFOption< BothFlags<[], [CC1Option], " experimental bounds safety extension for C">>; +defm lifetime_safety : BoolFOption< + "experimental-lifetime-safety", + LangOpts<"EnableLifetimeSafety">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], [CC1Option], + " experimental lifetime safety for C++">>; + defm addrsig : BoolFOption<"addrsig", CodeGenOpts<"Addrsig">, DefaultFalse, PosFlag, @@ -2176,10 +2171,10 @@ def fwasm_exceptions : Flag<["-"], "fwasm-exceptions">, Group, HelpText<"Use WebAssembly style exceptions">; def exception_model : Separate<["-"], "exception-model">, Visibility<[CC1Option]>, HelpText<"The exception model">, - Values<"dwarf,sjlj,seh,wasm">, - NormalizedValuesScope<"LangOptions::ExceptionHandlingKind">, - NormalizedValues<["DwarfCFI", "SjLj", "WinEH", "Wasm"]>, - MarshallingInfoEnum, "None">; + Values<"dwarf,sjlj,seh,wasm,none">, + NormalizedValuesScope<"CodeGenOptions::ExceptionHandlingKind">, + NormalizedValues<["DwarfCFI", "SjLj", "WinEH", "Wasm", "None"]>, + MarshallingInfoEnum, "None">; def exception_model_EQ : Joined<["-"], "exception-model=">, Visibility<[CC1Option]>, Alias; def fignore_exceptions : Flag<["-"], "fignore-exceptions">, Group, @@ -3270,13 +3265,14 @@ defm skip_odr_check_in_gmf : BoolOption<"f", "skip-odr-check-in-gmf", "Perform ODR checks for decls in the global module fragment.">>, Group; -def modules_reduced_bmi : Flag<["-"], "fmodules-reduced-bmi">, - Group, Visibility<[ClangOption, CC1Option]>, - HelpText<"Generate the reduced BMI">, - MarshallingInfoFlag>; +defm modules_reduced_bmi : BoolOption<"f", "modules-reduced-bmi", + FrontendOpts<"GenReducedBMI">, DefaultFalse, + NegFlag, + PosFlag>; def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">, - Group, Visibility<[ClangOption, CC1Option]>, Alias; + Group, Visibility<[ClangOption, CC1Option]>, Alias; def fmodules_embed_all_files : Joined<["-"], "fmodules-embed-all-files">, Visibility<[ClangOption, CC1Option, CLOption]>, @@ -7142,6 +7138,8 @@ def fintrinsic_modules_path : Separate<["-"], "fintrinsic-modules-path">, Group HelpText<"Specify where to find the compiled intrinsic modules">, DocBrief<[{This option specifies the location of pre-compiled intrinsic modules, if they are not in the default location expected by the compiler.}]>; +def fintrinsic_modules_path_EQ : Joined<["-"], "fintrinsic-modules-path=">, + Group, Alias; defm backslash : OptInFC1FFlag<"backslash", "Specify that backslash in string introduces an escape character">; defm xor_operator : OptInFC1FFlag<"xor-operator", "Enable .XOR. as a synonym of .NEQV.">; diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index b8899e78176b4..d9f7fa2c31ade 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -202,10 +202,6 @@ class ToolChain { ToolChain(const Driver &D, const llvm::Triple &T, const llvm::opt::ArgList &Args); - /// Executes the given \p Executable and returns the stdout. 
- llvm::Expected> - executeToolChainProgram(StringRef Executable) const; - void setTripleEnvironment(llvm::Triple::EnvironmentType Env); virtual Tool *buildAssembler() const; diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index b4f2a87fe7e83..7677604484f52 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4385,8 +4385,18 @@ struct FormatStyle { /// #include "B/a.h" #include "a/b.h" /// \endcode bool IgnoreCase; + /// When sorting includes in each block, only take file extensions into + /// account if two includes compare equal otherwise. + /// \code + /// true: false: + /// # include "A.h" vs. # include "A-util.h" + /// # include "A.inc" # include "A.h" + /// # include "A-util.h" # include "A.inc" + /// \endcode + bool IgnoreExtension; bool operator==(const SortIncludesOptions &R) const { - return Enabled == R.Enabled && IgnoreCase == R.IgnoreCase; + return Enabled == R.Enabled && IgnoreCase == R.IgnoreCase && + IgnoreExtension == R.IgnoreExtension; } bool operator!=(const SortIncludesOptions &R) const { return !(*this == R); diff --git a/clang/include/clang/Interpreter/Interpreter.h b/clang/include/clang/Interpreter/Interpreter.h index 78dff1165dcf5..83d2962cbf3ba 100644 --- a/clang/include/clang/Interpreter/Interpreter.h +++ b/clang/include/clang/Interpreter/Interpreter.h @@ -175,31 +175,42 @@ class Interpreter { llvm::Expected getSymbolAddressFromLinkerName(llvm::StringRef LinkerName) const; - const llvm::SmallVectorImpl &getValuePrintingInfo() const { - return ValuePrintingInfo; - } - - Expr *SynthesizeExpr(Expr *E); + std::unique_ptr GenModule(IncrementalAction *Action = nullptr); + PartialTranslationUnit &RegisterPTU(TranslationUnitDecl *TU, + std::unique_ptr M = {}, + IncrementalAction *Action = nullptr); private: size_t getEffectivePTUSize() const; void markUserCodeStart(); llvm::Expected ExtractValueFromExpr(Expr *E); - llvm::Expected CompileDtorCall(CXXRecordDecl *CXXRD); - - CodeGenerator *getCodeGen(IncrementalAction *Action = nullptr) const; - std::unique_ptr GenModule(IncrementalAction *Action = nullptr); - PartialTranslationUnit &RegisterPTU(TranslationUnitDecl *TU, - std::unique_ptr M = {}, - IncrementalAction *Action = nullptr); // A cache for the compiled destructors used to for de-allocation of managed // clang::Values. - llvm::DenseMap Dtors; + mutable llvm::DenseMap Dtors; - llvm::SmallVector ValuePrintingInfo; + std::array ValuePrintingInfo = {0}; std::unique_ptr JITBuilder; + + /// @} + /// @name Value and pretty printing support + /// @{ + + std::string ValueDataToString(const Value &V) const; + std::string ValueTypeToString(const Value &V) const; + + llvm::Expected convertExprToValue(Expr *E); + + // When we deallocate clang::Value we need to run the destructor of the type. + // This function forces emission of the needed dtor. 
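The `SortIncludesOptions::IgnoreExtension` flag added to Format.h above can also be set programmatically through libFormat. A sketch, assuming `FormatStyle::SortIncludes` is the `SortIncludesOptions` struct shown in the hunk (the `makeStyle` helper is illustrative):

```cpp
#include "clang/Format/Format.h"

// Sort includes so the extension only breaks ties: "A.h" and "A.inc"
// stay adjacent and both sort before "A-util.h".
clang::format::FormatStyle makeStyle() {
  clang::format::FormatStyle Style = clang::format::getLLVMStyle();
  Style.SortIncludes.Enabled = true;
  Style.SortIncludes.IgnoreCase = false;
  Style.SortIncludes.IgnoreExtension = true;
  return Style;
}
```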
+ llvm::Expected + CompileDtorCall(CXXRecordDecl *CXXRD) const; + + /// @} + /// @name Code generation + /// @{ + CodeGenerator *getCodeGen(IncrementalAction *Action = nullptr) const; }; } // namespace clang diff --git a/clang/include/clang/Interpreter/Value.h b/clang/include/clang/Interpreter/Value.h index a93c0841915fc..b91301e6096eb 100644 --- a/clang/include/clang/Interpreter/Value.h +++ b/clang/include/clang/Interpreter/Value.h @@ -35,6 +35,7 @@ #include "llvm/Config/llvm-config.h" // for LLVM_BUILD_LLVM_DYLIB, LLVM_BUILD_SHARED_LIBS #include "llvm/Support/Compiler.h" +#include #include // NOTE: Since the REPL itself could also include this runtime, extreme caution @@ -97,6 +98,7 @@ class REPL_EXTERNAL_VISIBILITY Value { REPL_BUILTIN_TYPES #undef X void *m_Ptr; + unsigned char m_RawBits[sizeof(long double) * 8]; // widest type }; public: @@ -111,7 +113,7 @@ class REPL_EXTERNAL_VISIBILITY Value { }; Value() = default; - Value(Interpreter *In, void *Ty); + Value(const Interpreter *In, void *Ty); Value(const Value &RHS); Value(Value &&RHS) noexcept; Value &operator=(const Value &RHS); @@ -124,9 +126,7 @@ class REPL_EXTERNAL_VISIBILITY Value { void dump() const; void clear(); - ASTContext &getASTContext(); const ASTContext &getASTContext() const; - Interpreter &getInterpreter(); const Interpreter &getInterpreter() const; QualType getType() const; @@ -140,6 +140,7 @@ class REPL_EXTERNAL_VISIBILITY Value { void *getPtr() const; void setPtr(void *Ptr) { Data.m_Ptr = Ptr; } + void setRawBits(void *Ptr, unsigned NBits = sizeof(Storage)); #define X(type, name) \ void set##name(type Val) { Data.m_##name = Val; } \ @@ -193,7 +194,7 @@ class REPL_EXTERNAL_VISIBILITY Value { } }; - Interpreter *Interp = nullptr; + const Interpreter *Interp = nullptr; void *OpaqueType = nullptr; Storage Data; Kind ValueKind = K_Unspecified; @@ -205,6 +206,5 @@ template <> inline void *Value::as() const { return Data.m_Ptr; return (void *)as(); } - } // namespace clang #endif diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index 4b7c8d609735f..e5680813e74de 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -44,8 +44,7 @@ namespace clang { class TypeLoc; class LangOptions; class IdentifierInfo; - class NamespaceAliasDecl; - class NamespaceDecl; + class NamespaceBaseDecl; class ObjCDeclSpec; class Sema; class Declarator; @@ -129,29 +128,15 @@ class CXXScopeSpec { /// \param Context The AST context in which this nested-name-specifier /// resides. /// - /// \param Namespace The namespace. + /// \param Namespace The namespace or the namespace alias. /// - /// \param NamespaceLoc The location of the namespace name. + /// \param NamespaceLoc The location of the namespace name or the namespace + /// alias. /// /// \param ColonColonLoc The location of the trailing '::'. - void Extend(ASTContext &Context, NamespaceDecl *Namespace, + void Extend(ASTContext &Context, NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc); - /// Extend the current nested-name-specifier by another - /// nested-name-specifier component of the form 'namespace-alias::'. - /// - /// \param Context The AST context in which this nested-name-specifier - /// resides. - /// - /// \param Alias The namespace alias. - /// - /// \param AliasLoc The location of the namespace alias - /// name. - /// - /// \param ColonColonLoc The location of the trailing '::'. 
- void Extend(ASTContext &Context, NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, SourceLocation ColonColonLoc); - /// Turn this (empty) nested-name-specifier into the global /// nested-name-specifier '::'. void MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc); diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 9135ff949eeab..d34a4146ddbd6 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -350,11 +350,6 @@ class Sema; LLVM_PREFERRED_TYPE(bool) unsigned BindsToRvalue : 1; - /// Whether this was an identity conversion with qualification - /// conversion for the implicit object argument. - LLVM_PREFERRED_TYPE(bool) - unsigned IsImplicitObjectArgumentQualificationConversion : 1; - /// Whether this binds an implicit object argument to a /// non-static member function without a ref-qualifier. LLVM_PREFERRED_TYPE(bool) @@ -453,11 +448,11 @@ class Sema; #endif return true; } + if (!C.hasSameType(getFromType(), getToType(2))) + return false; if (BindsToRvalue && IsLvalueReference) return false; - if (IsImplicitObjectArgumentQualificationConversion) - return C.hasSameUnqualifiedType(getFromType(), getToType(2)); - return C.hasSameType(getFromType(), getToType(2)); + return true; } ImplicitConversionRank getRank() const; @@ -1496,8 +1491,6 @@ class Sema; OverloadingResult BestViableFunctionImpl(Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best); - void PerfectViableFunction(Sema &S, SourceLocation Loc, - OverloadCandidateSet::iterator &Best); }; bool isBetterOverloadCandidate(Sema &S, const OverloadCandidate &Cand1, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index b331acbe606b7..73eb730ca555b 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -834,6 +834,13 @@ enum class CCEKind { ///< message. }; +/// Enums for the diagnostics of target, target_version and target_clones. +namespace DiagAttrParams { +enum DiagType { Unsupported, Duplicate, Unknown }; +enum Specifier { None, CPU, Tune }; +enum AttrName { Target, TargetClones, TargetVersion }; +} // end namespace DiagAttrParams + void inferNoReturnAttr(Sema &S, const Decl *D); /// Sema - This implements semantic analysis and AST building for C. @@ -4922,13 +4929,6 @@ class Sema final : public SemaBase { // handled later in the process, once we know how many exist. bool checkTargetAttr(SourceLocation LiteralLoc, StringRef Str); - /// Check Target Version attrs - bool checkTargetVersionAttr(SourceLocation Loc, Decl *D, StringRef Str); - bool checkTargetClonesAttrString( - SourceLocation LiteralLoc, StringRef Str, const StringLiteral *Literal, - Decl *D, bool &HasDefault, bool &HasCommas, bool &HasNotDefault, - SmallVectorImpl> &StringsBuffer); - ErrorAttr *mergeErrorAttr(Decl *D, const AttributeCommonInfo &CI, StringRef NewUserDiagnostic); FormatAttr *mergeFormatAttr(Decl *D, const AttributeCommonInfo &CI, diff --git a/clang/include/clang/Sema/SemaARM.h b/clang/include/clang/Sema/SemaARM.h index 788a7abf5f9c1..e77d65f9362d8 100644 --- a/clang/include/clang/Sema/SemaARM.h +++ b/clang/include/clang/Sema/SemaARM.h @@ -91,6 +91,11 @@ class SemaARM : public SemaBase { /// Return true if the given vector types are lax-compatible SVE vector types, /// false otherwise. 
bool areLaxCompatibleSveTypes(QualType FirstType, QualType SecondType); + + bool checkTargetVersionAttr(const StringRef Str, const SourceLocation Loc); + bool checkTargetClonesAttr(SmallVectorImpl &Params, + SmallVectorImpl &Locs, + SmallVectorImpl> &NewParams); }; SemaARM::ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD); diff --git a/clang/include/clang/Sema/SemaRISCV.h b/clang/include/clang/Sema/SemaRISCV.h index 8d2e1c6b7512f..844cc3ce4a440 100644 --- a/clang/include/clang/Sema/SemaRISCV.h +++ b/clang/include/clang/Sema/SemaRISCV.h @@ -55,6 +55,11 @@ class SemaRISCV : public SemaBase { bool DeclareAndesVectorBuiltins = false; std::unique_ptr IntrinsicManager; + + bool checkTargetVersionAttr(const StringRef Param, const SourceLocation Loc); + bool checkTargetClonesAttr(SmallVectorImpl &Params, + SmallVectorImpl &Locs, + SmallVectorImpl> &NewParams); }; std::unique_ptr diff --git a/clang/include/clang/Sema/SemaX86.h b/clang/include/clang/Sema/SemaX86.h index b5a23f1bede04..20783e344c02f 100644 --- a/clang/include/clang/Sema/SemaX86.h +++ b/clang/include/clang/Sema/SemaX86.h @@ -37,6 +37,10 @@ class SemaX86 : public SemaBase { void handleAnyInterruptAttr(Decl *D, const ParsedAttr &AL); void handleForceAlignArgPointerAttr(Decl *D, const ParsedAttr &AL); + + bool checkTargetClonesAttr(SmallVectorImpl &Params, + SmallVectorImpl &Locs, + SmallVectorImpl> &NewParams); }; } // namespace clang diff --git a/clang/include/clang/Serialization/TypeBitCodes.def b/clang/include/clang/Serialization/TypeBitCodes.def index b8cde2e370960..613eb6af2005a 100644 --- a/clang/include/clang/Serialization/TypeBitCodes.def +++ b/clang/include/clang/Serialization/TypeBitCodes.def @@ -69,5 +69,6 @@ TYPE_BIT_CODE(CountAttributed, COUNT_ATTRIBUTED, 57) TYPE_BIT_CODE(ArrayParameter, ARRAY_PARAMETER, 58) TYPE_BIT_CODE(HLSLAttributedResource, HLSLRESOURCE_ATTRIBUTED, 59) TYPE_BIT_CODE(HLSLInlineSpirv, HLSL_INLINE_SPIRV, 60) +TYPE_BIT_CODE(PredefinedSugar, PREDEFINED_SUGAR, 61) #undef TYPE_BIT_CODE diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h index 6370586e218ef..fbb34340a5c67 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h @@ -499,9 +499,6 @@ class ExprEngine { void VisitGuardedExpr(const Expr *Ex, const Expr *L, const Expr *R, ExplodedNode *Pred, ExplodedNodeSet &Dst); - void VisitInitListExpr(const InitListExpr *E, ExplodedNode *Pred, - ExplodedNodeSet &Dst); - /// VisitAttributedStmt - Transfer function logic for AttributedStmt. void VisitAttributedStmt(const AttributedStmt *A, ExplodedNode *Pred, ExplodedNodeSet &Dst); @@ -591,6 +588,10 @@ class ExprEngine { ExplodedNode *Pred, ExplodedNodeSet &Dst); + void ConstructInitList(const Expr *Source, ArrayRef Args, + bool IsTransparent, ExplodedNode *Pred, + ExplodedNodeSet &Dst); + /// evalEagerlyAssumeBifurcation - Given the nodes in 'Src', eagerly assume /// concrete boolean values for 'Ex', storing the resulting nodes in 'Dst'. 
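The `checkTargetVersionAttr`/`checkTargetClonesAttr` hooks declared above for SemaARM, SemaRISCV and SemaX86 validate the feature strings of function-multiversioning attributes. A hedged sketch of the source constructs they diagnose (each snippet assumes the matching target; the function names are invented):

```cpp
// x86: clones for an AVX2 variant plus the default implementation.
__attribute__((target_clones("avx2", "default")))
int dot(const int *a, const int *b, int n) {
  int s = 0;
  for (int i = 0; i < n; ++i)
    s += a[i] * b[i];
  return s;
}

// AArch64: a target_version variant; duplicate, unknown, or unsupported
// features in the string are what the new per-target checks report.
__attribute__((target_version("sve2")))
int norm(const int *v, int n);
```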
void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, diff --git a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h index 43dbfb1585151..da3efd76c6aae 100644 --- a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h +++ b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h @@ -38,29 +38,29 @@ // function clang_registerCheckers. For example: // // extern "C" -// void clang_registerCheckers (CheckerRegistry ®istry) { -// registry.addChecker("example.MainCallChecker", -// "Disallows calls to functions called main"); +// void clang_registerCheckers(CheckerRegistry &Registry) { +// Registry.addChecker( +// "example.MainCallChecker", +// "Disallows calls to functions called main"); // } // -// The first method argument is the full name of the checker, including its -// enclosing package. By convention, the registered name of a checker is the -// name of the associated class (the template argument). -// The second method argument is a short human-readable description of the -// checker. +// The first argument of this templated method is the full name of the checker +// (including its package), while the second argument is a short description +// that is printed by `-analyzer-checker-help`. // -// The clang_registerCheckers function may add any number of checkers to the -// registry. If any checkers require additional initialization, use the three- -// argument form of CheckerRegistry::addChecker. +// A plugin may register several separate checkers by calling `addChecker()` +// multiple times. If a checker requires custom registration functions (e.g. +// checker option handling) use the non-templated overload of `addChecker` that +// takes two callback functions as the first two parameters. // // To load a checker plugin, specify the full path to the dynamic library as // the argument to the -load option in the cc1 frontend. You can then enable // your custom checker using the -analyzer-checker: // -// clang -cc1 -load -analyze -// -analyzer-checker= +// clang -cc1 -load /path/to/plugin.dylib -analyze +// -analyzer-checker=example.MainCallChecker // -// For a complete working example, see examples/analyzer-plugin. +// For complete examples, see clang/lib/Analysis/plugins/SampleAnalyzer #ifndef CLANG_ANALYZER_API_VERSION_STRING // FIXME: The Clang version string is not particularly granular; @@ -108,30 +108,25 @@ class CheckerRegistry { mgr.template registerChecker(); } - template static bool returnTrue(const CheckerManager &mgr) { - return true; - } + static bool returnTrue(const CheckerManager &) { return true; } public: - /// Adds a checker to the registry. Use this non-templated overload when your - /// checker requires custom initialization. - void addChecker(RegisterCheckerFn Fn, ShouldRegisterFunction sfn, - StringRef FullName, StringRef Desc, StringRef DocsUri, - bool IsHidden); - - /// Adds a checker to the registry. Use this templated overload when your - /// checker does not require any custom initialization. - /// This function isn't really needed and probably causes more headaches than - /// the tiny convenience that it provides, but external plugins might use it, - /// and there isn't a strong incentive to remove it. + /// Adds a checker to the registry. + /// Use this for a checker defined in a plugin if it requires custom + /// registration functions (e.g. for handling checker options). + /// NOTE: As of now `DocsUri` is never queried from the checker registry. 
+ void addChecker(RegisterCheckerFn Fn, ShouldRegisterFunction Sfn, + StringRef FullName, StringRef Desc, + StringRef DocsUri = "NoDocsUri", bool IsHidden = false); + + /// Adds a checker to the registry. + /// Use this for a checker defined in a plugin if it doesn't require custom + /// registration functions. template - void addChecker(StringRef FullName, StringRef Desc, StringRef DocsUri, - bool IsHidden = false) { - // Avoid MSVC's Compiler Error C2276: - // http://msdn.microsoft.com/en-us/library/850cstw1(v=VS.80).aspx + void addChecker(StringRef FullName, StringRef Desc, + StringRef DocsUri = "NoDocsUri", bool IsHidden = false) { addChecker(&CheckerRegistry::initializeManager, - &CheckerRegistry::returnTrue, FullName, Desc, DocsUri, - IsHidden); + &CheckerRegistry::returnTrue, FullName, Desc, DocsUri, IsHidden); } /// Makes the checker with the full name \p fullName depend on the checker diff --git a/clang/include/clang/Tooling/Inclusions/IncludeStyle.h b/clang/include/clang/Tooling/Inclusions/IncludeStyle.h index fba90d8c51a66..bf060617deec7 100644 --- a/clang/include/clang/Tooling/Inclusions/IncludeStyle.h +++ b/clang/include/clang/Tooling/Inclusions/IncludeStyle.h @@ -126,8 +126,8 @@ struct IncludeStyle { /// * ``""`` means "arbitrary suffix" /// * ``"$"`` means "no suffix" /// - /// For example, if configured to ``"(_test)?$"``, then a header a.h would be seen - /// as the "main" include in both a.cc and a_test.cc. + /// For example, if configured to ``"(_test)?$"``, then a header a.h would be + /// seen as the "main" include in both a.cc and a_test.cc. /// \version 3.9 std::string IncludeIsMainRegex; diff --git a/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h b/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h index 015dbba26f688..271232e66626e 100644 --- a/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h +++ b/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h @@ -115,7 +115,8 @@ class RecursiveSymbolVisitor // The base visitor will visit NNSL prefixes, so we should only look at // the current NNS. if (NNS) { - const NamespaceDecl *ND = NNS.getNestedNameSpecifier()->getAsNamespace(); + const auto *ND = dyn_cast_if_present( + NNS.getNestedNameSpecifier()->getAsNamespace()); if (!visit(ND, NNS.getLocalBeginLoc(), NNS.getLocalEndLoc())) return false; } diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 50bd93a143a28..6b6275faa215a 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2597,6 +2597,9 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { } break; + case Type::PredefinedSugar: + return getTypeInfo(cast(T)->desugar().getTypePtr()); + case Type::Pipe: Width = Target->getPointerWidth(LangAS::opencl_global); Align = Target->getPointerAlign(LangAS::opencl_global); @@ -5216,6 +5219,39 @@ QualType ASTContext::getDependentBitIntType(bool IsUnsigned, return QualType(New, 0); } +QualType +ASTContext::getPredefinedSugarType(PredefinedSugarType::Kind KD) const { + using Kind = PredefinedSugarType::Kind; + + if (auto *Target = PredefinedSugarTypes[llvm::to_underlying(KD)]; + Target != nullptr) + return QualType(Target, 0); + + auto getCanonicalType = [](const ASTContext &Ctx, Kind KDI) -> QualType { + switch (KDI) { + // size_t (C99TC3 6.5.3.4), signed size_t (C++23 5.13.2) and + // ptrdiff_t (C99TC3 6.5.6) Although these types are not built-in, they + // are part of the core language and are widely used. 
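The rewritten `CheckerRegistry` header comment above describes how an out-of-tree plugin registers its checkers; below is a compact sketch of such a plugin translation unit using the updated two-argument `addChecker` form. The `MainCallChecker` body is a placeholder and the exact include set is an assumption:

```cpp
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
#include "clang/StaticAnalyzer/Frontend/CheckerRegistry.h"

using namespace clang;
using namespace ento;

namespace {
class MainCallChecker : public Checker<check::PreStmt<CallExpr>> {
public:
  void checkPreStmt(const CallExpr *CE, CheckerContext &C) const {
    // Placeholder: a real checker would diagnose calls to main() here.
  }
};
} // namespace

// Entry points looked up when the plugin is passed to -load; DocsUri and
// IsHidden now have defaults, so only the name and description are required.
extern "C" void clang_registerCheckers(CheckerRegistry &Registry) {
  Registry.addChecker<MainCallChecker>(
      "example.MainCallChecker", "Disallows calls to functions called main");
}

extern "C" const char clang_analyzerAPIVersionString[] =
    CLANG_ANALYZER_API_VERSION_STRING;
```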
Using + // PredefinedSugarType makes these types as named sugar types rather than + // standard integer types, enabling better hints and diagnostics. + case Kind::SizeT: + return Ctx.getFromTargetType(Ctx.Target->getSizeType()); + case Kind::SignedSizeT: + return Ctx.getFromTargetType(Ctx.Target->getSignedSizeType()); + case Kind::PtrdiffT: + return Ctx.getFromTargetType(Ctx.Target->getPtrDiffType(LangAS::Default)); + } + llvm_unreachable("unexpected kind"); + }; + + auto *New = new (*this, alignof(PredefinedSugarType)) + PredefinedSugarType(KD, &Idents.get(PredefinedSugarType::getName(KD)), + getCanonicalType(*this, static_cast(KD))); + Types.push_back(New); + PredefinedSugarTypes[llvm::to_underlying(KD)] = New; + return QualType(New, 0); +} + #ifndef NDEBUG static bool NeedsInjectedClassNameType(const RecordDecl *D) { if (!isa(D)) return false; @@ -6796,14 +6832,31 @@ QualType ASTContext::getTagDeclType(const TagDecl *Decl) const { /// getSizeType - Return the unique type for "size_t" (C99 7.17), the result /// of the sizeof operator (C99 6.5.3.4p4). The value is target dependent and /// needs to agree with the definition in . -CanQualType ASTContext::getSizeType() const { +QualType ASTContext::getSizeType() const { + return getPredefinedSugarType(PredefinedSugarType::Kind::SizeT); +} + +CanQualType ASTContext::getCanonicalSizeType() const { return getFromTargetType(Target->getSizeType()); } /// Return the unique signed counterpart of the integer type /// corresponding to size_t. -CanQualType ASTContext::getSignedSizeType() const { - return getFromTargetType(Target->getSignedSizeType()); +QualType ASTContext::getSignedSizeType() const { + return getPredefinedSugarType(PredefinedSugarType::Kind::SignedSizeT); +} + +/// getPointerDiffType - Return the unique type for "ptrdiff_t" (C99 7.17) +/// defined in . Pointer - pointer requires this (C99 6.5.6p9). +QualType ASTContext::getPointerDiffType() const { + return getPredefinedSugarType(PredefinedSugarType::Kind::PtrdiffT); +} + +/// Return the unique unsigned counterpart of "ptrdiff_t" +/// integer type. The standard (C11 7.21.6.1p7) refers to this type +/// in the definition of %tu format specifier. +QualType ASTContext::getUnsignedPointerDiffType() const { + return getFromTargetType(Target->getUnsignedPtrDiffType(LangAS::Default)); } /// getIntMaxType - Return the unique type for "intmax_t" (C99 7.18.1.5). @@ -6838,19 +6891,6 @@ QualType ASTContext::getUIntPtrType() const { return getCorrespondingUnsignedType(getIntPtrType()); } -/// getPointerDiffType - Return the unique type for "ptrdiff_t" (C99 7.17) -/// defined in . Pointer - pointer requires this (C99 6.5.6p9). -QualType ASTContext::getPointerDiffType() const { - return getFromTargetType(Target->getPtrDiffType(LangAS::Default)); -} - -/// Return the unique unsigned counterpart of "ptrdiff_t" -/// integer type. The standard (C11 7.21.6.1p7) refers to this type -/// in the definition of %tu format specifier. -QualType ASTContext::getUnsignedPointerDiffType() const { - return getFromTargetType(Target->getUnsignedPtrDiffType(LangAS::Default)); -} - /// Return the unique type for "pid_t" defined in /// . We need this to compute the correct type for vfork(). 
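With `getSizeType()` and `getPointerDiffType()` now returning a `PredefinedSugarType`, the spelled names survive in the AST rather than decaying immediately to the target integer type. A source-level illustration of where that sugar appears (the improved diagnostic wording is the patch's stated goal, not quoted compiler output):

```cpp
#include <cstddef>

// sizeof(...) carries the "size_t" sugar; pointer subtraction carries
// "ptrdiff_t", rather than the raw 'unsigned long'/'long' of the target.
auto Bytes = sizeof(int);

std::ptrdiff_t gap(const int *a, const int *b) { return a - b; }
```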
QualType ASTContext::getProcessIDType() const { @@ -7387,21 +7427,9 @@ bool ASTContext::isSameDefaultTemplateArgument(const NamedDecl *X, return hasSameTemplateName(TAX.getAsTemplate(), TAY.getAsTemplate()); } -static NamespaceDecl *getNamespace(const NestedNameSpecifier *X) { - if (auto *NS = X->getAsNamespace()) - return NS; - if (auto *NAS = X->getAsNamespaceAlias()) - return NAS->getNamespace(); - return nullptr; -} - static bool isSameQualifier(const NestedNameSpecifier *X, const NestedNameSpecifier *Y) { - if (auto *NSX = getNamespace(X)) { - auto *NSY = getNamespace(Y); - if (!NSY || NSX->getCanonicalDecl() != NSY->getCanonicalDecl()) - return false; - } else if (X->getKind() != Y->getKind()) + if (X->getKind() != Y->getKind()) return false; // FIXME: For namespaces and types, we're permitted to check that the entity @@ -7412,8 +7440,8 @@ static bool isSameQualifier(const NestedNameSpecifier *X, return false; break; case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: - // We've already checked that we named the same namespace. + if (!declaresSameEntity(X->getAsNamespace(), Y->getAsNamespace())) + return false; break; case NestedNameSpecifier::TypeSpec: if (X->getAsType()->getCanonicalTypeInternal() != @@ -7836,17 +7864,10 @@ ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const { NNS->getAsIdentifier()); case NestedNameSpecifier::Namespace: - // A namespace is canonical; build a nested-name-specifier with - // this namespace and no prefix. - return NestedNameSpecifier::Create(*this, nullptr, - NNS->getAsNamespace()->getFirstDecl()); - - case NestedNameSpecifier::NamespaceAlias: // A namespace is canonical; build a nested-name-specifier with // this namespace and no prefix. return NestedNameSpecifier::Create( - *this, nullptr, - NNS->getAsNamespaceAlias()->getNamespace()->getFirstDecl()); + *this, nullptr, NNS->getAsNamespace()->getNamespace()->getFirstDecl()); // The difference between TypeSpec and TypeSpecWithTemplate is that the // latter will have the 'template' keyword when printed. @@ -13698,26 +13719,27 @@ static NestedNameSpecifier *getCommonNNS(ASTContext &Ctx, R = NestedNameSpecifier::Create(Ctx, P, II); break; } - case NestedNameSpecifier::SpecifierKind::Namespace: - case NestedNameSpecifier::SpecifierKind::NamespaceAlias: { - assert(K2 == NestedNameSpecifier::SpecifierKind::Namespace || - K2 == NestedNameSpecifier::SpecifierKind::NamespaceAlias); + case NestedNameSpecifier::SpecifierKind::Namespace: { + assert(K2 == NestedNameSpecifier::SpecifierKind::Namespace); // The prefixes for namespaces are not significant, its declaration // identifies it uniquely. NestedNameSpecifier *P = ::getCommonNNS(Ctx, NNS1->getPrefix(), NNS2->getPrefix(), /*IsSame=*/false); - NamespaceAliasDecl *A1 = NNS1->getAsNamespaceAlias(), - *A2 = NNS2->getAsNamespaceAlias(); - // Are they the same namespace alias? - if (declaresSameEntity(A1, A2)) { - R = NestedNameSpecifier::Create(Ctx, P, ::getCommonDeclChecked(A1, A2)); + NamespaceBaseDecl *Namespace1 = NNS1->getAsNamespace(), + *Namespace2 = NNS2->getAsNamespace(); + auto Kind = Namespace1->getKind(); + if (Kind != Namespace2->getKind() || + (Kind == Decl::NamespaceAlias && + !declaresSameEntity(Namespace1, Namespace2))) { + R = NestedNameSpecifier::Create( + Ctx, P, + ::getCommonDeclChecked(Namespace1->getNamespace(), + Namespace2->getNamespace())); break; } - // Otherwise, look at the namespaces only. - NamespaceDecl *N1 = A1 ? A1->getNamespace() : NNS1->getAsNamespace(), - *N2 = A2 ? 
A2->getNamespace() : NNS2->getAsNamespace(); - R = NestedNameSpecifier::Create(Ctx, P, ::getCommonDeclChecked(N1, N2)); + R = NestedNameSpecifier::Create( + Ctx, P, ::getCommonDeclChecked(Namespace1, Namespace2)); break; } case NestedNameSpecifier::SpecifierKind::TypeSpec: { @@ -14521,6 +14543,10 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X, DX->isCountInBytes(), DX->isOrNull(), CDX); } + case Type::PredefinedSugar: + assert(cast(X)->getKind() != + cast(Y)->getKind()); + return QualType(); } llvm_unreachable("Unhandled Type Class"); } diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 4d3bd985739fb..b9bdabe0b8c06 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -2080,6 +2080,11 @@ ExpectedType clang::ASTNodeImporter::VisitDependentBitIntType( *ToNumBitsExprOrErr); } +ExpectedType clang::ASTNodeImporter::VisitPredefinedSugarType( + const clang::PredefinedSugarType *T) { + return Importer.getToContext().getPredefinedSugarType(T->getKind()); +} + ExpectedType clang::ASTNodeImporter::VisitDependentSizedMatrixType( const clang::DependentSizedMatrixType *T) { Error Err = Error::success(); @@ -10063,17 +10068,10 @@ ASTImporter::Import(NestedNameSpecifier *FromNNS) { case NestedNameSpecifier::Namespace: if (ExpectedDecl NSOrErr = Import(FromNNS->getAsNamespace())) { return NestedNameSpecifier::Create(ToContext, Prefix, - cast(*NSOrErr)); + cast(*NSOrErr)); } else return NSOrErr.takeError(); - case NestedNameSpecifier::NamespaceAlias: - if (ExpectedDecl NSADOrErr = Import(FromNNS->getAsNamespaceAlias())) - return NestedNameSpecifier::Create(ToContext, Prefix, - cast(*NSADOrErr)); - else - return NSADOrErr.takeError(); - case NestedNameSpecifier::Global: return NestedNameSpecifier::GlobalSpecifier(ToContext); @@ -10139,11 +10137,6 @@ ASTImporter::Import(NestedNameSpecifierLoc FromNNS) { ToLocalEndLoc); break; - case NestedNameSpecifier::NamespaceAlias: - Builder.Extend(getToContext(), Spec->getAsNamespaceAlias(), - ToLocalBeginLoc, ToLocalEndLoc); - break; - case NestedNameSpecifier::TypeSpec: { SourceLocation ToTLoc; if (Error Err = importInto(ToTLoc, NNS.getTypeLoc().getBeginLoc())) diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 3aa6b37844103..0f2762d5c0f14 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -598,9 +598,6 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, case NestedNameSpecifier::Namespace: return IsStructurallyEquivalent(Context, NNS1->getAsNamespace(), NNS2->getAsNamespace()); - case NestedNameSpecifier::NamespaceAlias: - return IsStructurallyEquivalent(Context, NNS1->getAsNamespaceAlias(), - NNS2->getAsNamespaceAlias()); case NestedNameSpecifier::TypeSpec: return IsStructurallyEquivalent(Context, QualType(NNS1->getAsType(), 0), QualType(NNS2->getAsType(), 0)); @@ -1480,6 +1477,13 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, return false; break; } + case Type::PredefinedSugar: { + const auto *TP1 = cast(T1); + const auto *TP2 = cast(T2); + if (TP1->getKind() != TP2->getKind()) + return false; + break; + } } // end switch return true; diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp index 965e235036031..3288585683c10 100644 --- a/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp +++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.cpp @@ -62,7 +62,7 @@ void 
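After the switch to `NamespaceBaseDecl`, a namespace and an alias to it share the single `Namespace` specifier kind, and canonicalization resolves the alias to the underlying namespace. A small source-level illustration (the names are invented):

```cpp
namespace very_long_project_name { struct Widget {}; }
namespace vp = very_long_project_name;

// Both qualifiers below are NestedNameSpecifier::Namespace; the second holds
// the NamespaceAliasDecl, and the canonical specifier for both names the same
// NamespaceDecl, so 'a' and 'b' have identical types.
very_long_project_name::Widget a;
vp::Widget b;
```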
ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl, (Func->hasThisPointer() && !Func->isThisPointerExplicit()); for (auto ParamOffset : llvm::drop_begin(Func->ParamOffsets, Drop)) { const ParmVarDecl *PD = FuncDecl->parameters()[ParamIndex]; - std::optional T = Ctx.classify(PD->getType()); + OptPrimType T = Ctx.classify(PD->getType()); this->Params.insert({PD, {ParamOffset, T != std::nullopt}}); ++ParamIndex; } diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index ea473730350b6..07efd6f852fc2 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -237,7 +237,7 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { if (SubExpr->getType().isVolatileQualified()) return this->emitInvalidCast(CastKind::Volatile, /*Fatal=*/true, CE); - std::optional SubExprT = classify(SubExpr->getType()); + OptPrimType SubExprT = classify(SubExpr->getType()); // Prepare storage for the result. if (!Initializing && !SubExprT) { std::optional LocalIndex = allocateLocal(SubExpr); @@ -388,7 +388,7 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { const Descriptor *Desc = nullptr; const QualType PointeeType = CE->getType()->getPointeeType(); if (!PointeeType.isNull()) { - if (std::optional T = classify(PointeeType)) + if (OptPrimType T = classify(PointeeType)) Desc = P.createDescriptor(SubExpr, *T); else Desc = P.createDescriptor(SubExpr, PointeeType.getTypePtr(), @@ -436,7 +436,7 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { PrimType T = classifyPrim(IntType); QualType PtrType = CE->getType(); const Descriptor *Desc; - if (std::optional T = classify(PtrType->getPointeeType())) + if (OptPrimType T = classify(PtrType->getPointeeType())) Desc = P.createDescriptor(SubExpr, *T); else if (PtrType->getPointeeType()->isVoidType()) Desc = nullptr; @@ -473,12 +473,12 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { return this->emitInvalidCast(CastKind::Reinterpret, /*Fatal=*/true, CE); } QualType SubExprTy = SubExpr->getType(); - std::optional FromT = classify(SubExprTy); + OptPrimType FromT = classify(SubExprTy); // Casts from integer/vector to vector. if (CE->getType()->isVectorType()) return this->emitBuiltinBitCast(CE); - std::optional ToT = classify(CE->getType()); + OptPrimType ToT = classify(CE->getType()); if (!FromT || !ToT) return false; @@ -504,7 +504,7 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { case CK_IntegralToBoolean: case CK_FixedPointToBoolean: { // HLSL uses this to cast to one-element vectors. - std::optional FromT = classify(SubExpr->getType()); + OptPrimType FromT = classify(SubExpr->getType()); if (!FromT) return false; @@ -517,8 +517,8 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { case CK_BooleanToSignedIntegral: case CK_IntegralCast: { - std::optional FromT = classify(SubExpr->getType()); - std::optional ToT = classify(CE->getType()); + OptPrimType FromT = classify(SubExpr->getType()); + OptPrimType ToT = classify(CE->getType()); if (!FromT || !ToT) return false; @@ -688,7 +688,7 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { case CK_HLSLVectorTruncation: { assert(SubExpr->getType()->isVectorType()); - if (std::optional ResultT = classify(CE)) { + if (OptPrimType ResultT = classify(CE)) { assert(!DiscardResult); // Result must be either a float or integer. Take the first element. if (!this->visit(SubExpr)) @@ -872,9 +872,9 @@ bool Compiler::VisitBinaryOperator(const BinaryOperator *BO) { } // Typecheck the args. 
- std::optional LT = classify(LHS); - std::optional RT = classify(RHS); - std::optional T = classify(BO->getType()); + OptPrimType LT = classify(LHS); + OptPrimType RT = classify(RHS); + OptPrimType T = classify(BO->getType()); // Special case for C++'s three-way/spaceship operator <=>, which // returns a std::{strong,weak,partial}_ordering (which is a class, so doesn't @@ -995,8 +995,8 @@ bool Compiler::VisitPointerArithBinOp(const BinaryOperator *E) { (!LHS->getType()->isPointerType() && !RHS->getType()->isPointerType())) return false; - std::optional LT = classify(LHS); - std::optional RT = classify(RHS); + OptPrimType LT = classify(LHS); + OptPrimType RT = classify(RHS); if (!LT || !RT) return false; @@ -1068,7 +1068,7 @@ bool Compiler::VisitLogicalBinOp(const BinaryOperator *E) { BinaryOperatorKind Op = E->getOpcode(); const Expr *LHS = E->getLHS(); const Expr *RHS = E->getRHS(); - std::optional T = classify(E->getType()); + OptPrimType T = classify(E->getType()); if (Op == BO_LOr) { // Logical OR. Visit LHS and only evaluate RHS if LHS was FALSE. @@ -1648,7 +1648,7 @@ bool Compiler::VisitImplicitValueInitExpr( const ImplicitValueInitExpr *E) { QualType QT = E->getType(); - if (std::optional T = classify(QT)) + if (OptPrimType T = classify(QT)) return this->visitZeroInitializer(*T, QT, E); if (QT->isRecordType()) { @@ -1734,7 +1734,7 @@ bool Compiler::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { if (!Success) return false; - std::optional IndexT = classify(Index->getType()); + OptPrimType IndexT = classify(Index->getType()); // In error-recovery cases, the index expression has a dependent type. if (!IndexT) return this->emitError(E); @@ -1776,7 +1776,7 @@ bool Compiler::visitInitList(ArrayRef Inits, } // Primitive values. - if (std::optional T = classify(QT)) { + if (OptPrimType T = classify(QT)) { assert(!DiscardResult); if (Inits.size() == 0) return this->visitZeroInitializer(*T, QT, E); @@ -1840,7 +1840,7 @@ bool Compiler::visitInitList(ArrayRef Inits, FToInit = cast(E)->getInitializedFieldInUnion(); const Record::Field *FieldToInit = R->getField(FToInit); - if (std::optional T = classify(Init)) { + if (OptPrimType T = classify(Init)) { if (!initPrimitiveField(FieldToInit, Init, *T, /*Activate=*/true)) return false; } else { @@ -1859,7 +1859,7 @@ bool Compiler::visitInitList(ArrayRef Inits, R->getField(InitIndex)->isUnnamedBitField()) ++InitIndex; - if (std::optional T = classify(Init)) { + if (OptPrimType T = classify(Init)) { const Record::Field *FieldToInit = R->getField(InitIndex); if (!initPrimitiveField(FieldToInit, Init, *T)) return false; @@ -1899,7 +1899,7 @@ bool Compiler::visitInitList(ArrayRef Inits, if (!this->emitCheckArraySize(NumElems, E)) return false; - std::optional InitT = classify(CAT->getElementType()); + OptPrimType InitT = classify(CAT->getElementType()); unsigned ElementIndex = 0; for (const Expr *Init : Inits) { if (const auto *EmbedS = @@ -2013,7 +2013,7 @@ bool Compiler::visitInitList(ArrayRef Inits, /// this. template bool Compiler::visitArrayElemInit(unsigned ElemIndex, const Expr *Init, - std::optional InitT) { + OptPrimType InitT) { if (InitT) { // Visit the primitive element like normal. 
if (!this->visit(Init)) @@ -2042,7 +2042,7 @@ bool Compiler::visitCallArgs(ArrayRef Args, unsigned ArgIndex = 0; for (const Expr *Arg : Args) { - if (std::optional T = classify(Arg)) { + if (OptPrimType T = classify(Arg)) { if (!this->visit(Arg)) return false; } else { @@ -2097,7 +2097,7 @@ bool Compiler::VisitSubstNonTypeTemplateParmExpr( template bool Compiler::VisitConstantExpr(const ConstantExpr *E) { - std::optional T = classify(E->getType()); + OptPrimType T = classify(E->getType()); if (T && E->hasAPValueResult()) { // Try to emit the APValue directly, without visiting the subexpr. // This will only fail if we can't emit the APValue, so won't emit any @@ -2292,7 +2292,7 @@ bool Compiler::VisitMemberExpr(const MemberExpr *E) { const auto maybeLoadValue = [&]() -> bool { if (E->isGLValue()) return true; - if (std::optional T = classify(E)) + if (OptPrimType T = classify(E)) return this->emitLoadPop(*T, E); return false; }; @@ -2357,7 +2357,7 @@ bool Compiler::VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E) { // Investigate compiling this to a loop. const Expr *SubExpr = E->getSubExpr(); size_t Size = E->getArraySize().getZExtValue(); - std::optional SubExprT = classify(SubExpr); + OptPrimType SubExprT = classify(SubExpr); // So, every iteration, we execute an assignment here // where the LHS is on the stack (the target array) @@ -2589,8 +2589,8 @@ bool Compiler::VisitFloatCompoundAssignOperator( QualType LHSType = LHS->getType(); QualType LHSComputationType = E->getComputationLHSType(); QualType ResultType = E->getComputationResultType(); - std::optional LT = classify(LHSComputationType); - std::optional RT = classify(ResultType); + OptPrimType LT = classify(LHSComputationType); + OptPrimType RT = classify(ResultType); assert(ResultType->isFloatingType()); @@ -2659,8 +2659,8 @@ bool Compiler::VisitPointerCompoundAssignOperator( BinaryOperatorKind Op = E->getOpcode(); const Expr *LHS = E->getLHS(); const Expr *RHS = E->getRHS(); - std::optional LT = classify(LHS->getType()); - std::optional RT = classify(RHS->getType()); + OptPrimType LT = classify(LHS->getType()); + OptPrimType RT = classify(RHS->getType()); if (Op != BO_AddAssign && Op != BO_SubAssign) return false; @@ -2698,11 +2698,10 @@ bool Compiler::VisitCompoundAssignOperator( const Expr *LHS = E->getLHS(); const Expr *RHS = E->getRHS(); - std::optional LHSComputationT = - classify(E->getComputationLHSType()); - std::optional LT = classify(LHS->getType()); - std::optional RT = classify(RHS->getType()); - std::optional ResultT = classify(E->getType()); + OptPrimType LHSComputationT = classify(E->getComputationLHSType()); + OptPrimType LT = classify(LHS->getType()); + OptPrimType RT = classify(RHS->getType()); + OptPrimType ResultT = classify(E->getType()); if (!Ctx.getLangOpts().CPlusPlus14) return this->visit(RHS) && this->visit(LHS) && this->emitError(E); @@ -2837,7 +2836,7 @@ bool Compiler::VisitMaterializeTemporaryExpr( // When we're initializing a global variable *or* the storage duration of // the temporary is explicitly static, create a global variable. 
- std::optional SubExprT = classify(SubExpr); + OptPrimType SubExprT = classify(SubExpr); bool IsStatic = E->getStorageDuration() == SD_Static; if (IsStatic) { std::optional GlobalIndex = P.createGlobal(E); @@ -2931,7 +2930,7 @@ bool Compiler::VisitCompoundLiteralExpr(const CompoundLiteralExpr *E) { return this->visitInitializer(Init) && this->emitFinishInit(E); } - std::optional T = classify(E->getType()); + OptPrimType T = classify(E->getType()); if (E->isFileScope()) { // Avoid creating a variable if this is a primitive RValue anyway. if (T && !E->isLValue()) @@ -3014,7 +3013,7 @@ bool Compiler::VisitLambdaExpr(const LambdaExpr *E) { continue; ++CaptureInitIt; - if (std::optional T = classify(Init)) { + if (OptPrimType T = classify(Init)) { if (!this->visit(Init)) return false; @@ -3061,21 +3060,21 @@ bool Compiler::VisitCXXReinterpretCastExpr( const CXXReinterpretCastExpr *E) { const Expr *SubExpr = E->getSubExpr(); - std::optional FromT = classify(SubExpr); - std::optional ToT = classify(E); + OptPrimType FromT = classify(SubExpr); + OptPrimType ToT = classify(E); if (!FromT || !ToT) return this->emitInvalidCast(CastKind::Reinterpret, /*Fatal=*/true, E); if (FromT == PT_Ptr || ToT == PT_Ptr) { // Both types could be PT_Ptr because their expressions are glvalues. - std::optional PointeeFromT; + OptPrimType PointeeFromT; if (SubExpr->getType()->isPointerOrReferenceType()) PointeeFromT = classify(SubExpr->getType()->getPointeeType()); else PointeeFromT = classify(SubExpr->getType()); - std::optional PointeeToT; + OptPrimType PointeeToT; if (E->getType()->isPointerOrReferenceType()) PointeeToT = classify(E->getType()->getPointeeType()); else @@ -3344,7 +3343,7 @@ bool Compiler::VisitCXXScalarValueInitExpr( if (DiscardResult || Ty->isVoidType()) return true; - if (std::optional T = classify(Ty)) + if (OptPrimType T = classify(Ty)) return this->visitZeroInitializer(*T, Ty, E); if (const auto *CT = Ty->getAs()) { @@ -3457,7 +3456,7 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { assert(classifyPrim(E->getType()) == PT_Ptr); const Expr *Init = E->getInitializer(); QualType ElementType = E->getAllocatedType(); - std::optional ElemT = classify(ElementType); + OptPrimType ElemT = classify(ElementType); unsigned PlacementArgs = E->getNumPlacementArgs(); const FunctionDecl *OperatorNew = E->getOperatorNew(); const Expr *PlacementDest = nullptr; @@ -3645,7 +3644,7 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { if (!this->emitStorePop(InitT, E)) return false; } else if (DynamicInit) { - if (std::optional InitT = classify(DynamicInit)) { + if (OptPrimType InitT = classify(DynamicInit)) { if (!this->visit(DynamicInit)) return false; if (!this->emitStorePop(*InitT, E)) @@ -4154,7 +4153,7 @@ bool Compiler::visitInitializer(const Expr *E) { } template bool Compiler::visitBool(const Expr *E) { - std::optional T = classify(E->getType()); + OptPrimType T = classify(E->getType()); if (!T) { // Convert complex values to bool. if (E->getType()->isAnyComplexType()) { @@ -4309,7 +4308,7 @@ bool Compiler::visitZeroArrayInitializer(QualType T, const Expr *E) { QualType ElemType = AT->getElementType(); size_t NumElems = cast(AT)->getZExtSize(); - if (std::optional ElemT = classify(ElemType)) { + if (OptPrimType ElemT = classify(ElemType)) { for (size_t I = 0; I != NumElems; ++I) { if (!this->visitZeroInitializer(*ElemT, ElemType, E)) return false; @@ -4602,7 +4601,7 @@ bool Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { } // Expressions with a primitive return type. 
- if (std::optional T = classify(E)) { + if (OptPrimType T = classify(E)) { if (!visit(E)) return false; @@ -4679,7 +4678,7 @@ bool Compiler::visitDeclAndReturn(const VarDecl *VD, if (!this->visitVarDecl(VD, /*Toplevel=*/true)) return false; - std::optional VarT = classify(VD->getType()); + OptPrimType VarT = classify(VD->getType()); if (Context::shouldBeGloballyIndexed(VD)) { auto GlobalIndex = P.getGlobal(VD); assert(GlobalIndex); // visitVarDecl() didn't return false. @@ -4736,7 +4735,7 @@ VarCreationState Compiler::visitVarDecl(const VarDecl *VD, return VarCreationState::NotCreated(); const Expr *Init = VD->getInit(); - std::optional VarT = classify(VD->getType()); + OptPrimType VarT = classify(VD->getType()); if (Init && Init->isValueDependent()) return false; @@ -4868,7 +4867,7 @@ bool Compiler::visitAPValueInitializer(const APValue &Val, const Record::Field *RF = R->getField(I); QualType FieldType = RF->Decl->getType(); - if (std::optional PT = classify(FieldType)) { + if (OptPrimType PT = classify(FieldType)) { if (!this->visitAPValue(F, *PT, E)) return false; if (!this->emitInitField(*PT, RF->Offset, E)) @@ -4898,7 +4897,7 @@ bool Compiler::visitAPValueInitializer(const APValue &Val, QualType ElemType = ArrType->getElementType(); for (unsigned A = 0, AN = Val.getArraySize(); A != AN; ++A) { const APValue &Elem = Val.getArrayInitializedElt(A); - if (std::optional ElemT = classify(ElemType)) { + if (OptPrimType ElemT = classify(ElemType)) { if (!this->visitAPValue(Elem, *ElemT, E)) return false; if (!this->emitInitElem(*ElemT, A, E)) @@ -4958,7 +4957,7 @@ bool Compiler::VisitBuiltinCallExpr(const CallExpr *E, } QualType ReturnType = E->getType(); - std::optional ReturnT = classify(E); + OptPrimType ReturnT = classify(E); // Non-primitive return type. Prepare storage. 
if (!Initializing && !ReturnT && !ReturnType->isVoidType()) { @@ -5032,7 +5031,7 @@ bool Compiler::VisitCallExpr(const CallExpr *E) { BlockScope CallScope(this, ScopeKind::Call); QualType ReturnType = E->getCallReturnType(Ctx.getASTContext()); - std::optional T = classify(ReturnType); + OptPrimType T = classify(ReturnType); bool HasRVO = !ReturnType->isVoidType() && !T; if (HasRVO) { @@ -5933,7 +5932,7 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { if (InitExpr->getType().isNull()) return false; - if (std::optional T = this->classify(InitExpr)) { + if (OptPrimType T = this->classify(InitExpr)) { if (!this->visit(InitExpr)) return false; @@ -6189,7 +6188,7 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return this->VisitVectorUnaryOperator(E); if (SubExpr->getType()->isFixedPointType()) return this->VisitFixedPointUnaryOperator(E); - std::optional T = classify(SubExpr->getType()); + OptPrimType T = classify(SubExpr->getType()); switch (E->getOpcode()) { case UO_PostInc: { // x++ @@ -6375,6 +6374,9 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; + if (!this->emitCheckNull(E)) + return false; + if (classifyPrim(SubExpr) == PT_Ptr) return this->emitNarrowPtr(E); return true; @@ -6412,7 +6414,7 @@ bool Compiler::VisitComplexUnaryOperator(const UnaryOperator *E) { if (DiscardResult) return this->discard(SubExpr); - std::optional ResT = classify(E); + OptPrimType ResT = classify(E); auto prepareResult = [=]() -> bool { if (!ResT && !Initializing) { std::optional LocalIndex = allocateLocal(SubExpr); @@ -6634,7 +6636,7 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { if (std::optional Index = P.getOrCreateGlobal(D)) { if (!this->emitGetPtrGlobal(*Index, E)) return false; - if (std::optional T = classify(E->getType())) { + if (OptPrimType T = classify(E->getType())) { if (!this->visitAPValue(TPOD->getValue(), *T, E)) return false; return this->emitInitGlobal(*T, *Index, E); @@ -6670,6 +6672,11 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { } // Function parameters. 
if (const auto *PVD = dyn_cast(D)) { + if (Ctx.getLangOpts().CPlusPlus && !Ctx.getLangOpts().CPlusPlus11 && + !D->getType()->isIntegralOrEnumerationType()) { + return this->emitInvalidDeclRef(cast(E), + /*InitializerFailed=*/false, E); + } if (auto It = this->Params.find(PVD); It != this->Params.end()) { if (IsReference || !It->second.IsPtr) return this->emitGetParam(classifyPrim(E), It->second.Offset, E); @@ -7128,7 +7135,7 @@ bool Compiler::emitBuiltinBitCast(const CastExpr *E) { const Expr *SubExpr = E->getSubExpr(); QualType FromType = SubExpr->getType(); QualType ToType = E->getType(); - std::optional ToT = classify(ToType); + OptPrimType ToT = classify(ToType); assert(!ToType->isReferenceType()); @@ -7149,7 +7156,7 @@ bool Compiler::emitBuiltinBitCast(const CastExpr *E) { if (SubExpr->isGLValue() || FromType->isVectorType()) { if (!this->visit(SubExpr)) return false; - } else if (std::optional FromT = classify(SubExpr)) { + } else if (OptPrimType FromT = classify(SubExpr)) { unsigned TempOffset = allocateLocalPrimitive(SubExpr, *FromT, /*IsConst=*/true); if (!this->visit(SubExpr)) diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index debee6726853b..503269399c757 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -254,12 +254,8 @@ class Compiler : public ConstStmtVisitor, bool>, /// If the function does not exist yet, it is compiled. const Function *getFunction(const FunctionDecl *FD); - std::optional classify(const Expr *E) const { - return Ctx.classify(E); - } - std::optional classify(QualType Ty) const { - return Ctx.classify(Ty); - } + OptPrimType classify(const Expr *E) const { return Ctx.classify(E); } + OptPrimType classify(QualType Ty) const { return Ctx.classify(Ty); } /// Classifies a known primitive type. PrimType classifyPrim(QualType Ty) const { @@ -306,7 +302,7 @@ class Compiler : public ConstStmtVisitor, bool>, bool visitInitList(ArrayRef Inits, const Expr *ArrayFiller, const Expr *E); bool visitArrayElemInit(unsigned ElemIndex, const Expr *Init, - std::optional InitT); + OptPrimType InitT); bool visitCallArgs(ArrayRef Args, const FunctionDecl *FuncDecl, bool Activate); @@ -435,7 +431,7 @@ class Compiler : public ConstStmtVisitor, bool>, bool InitStackActive = false; /// Type of the expression returned by the function. - std::optional ReturnType; + OptPrimType ReturnType; /// Switch case mapping. 
CaseMap CaseLabels; diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index a629ff9569428..aaeb52e0fa449 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -52,6 +52,19 @@ bool Context::isPotentialConstantExpr(State &Parent, const FunctionDecl *FD) { return Func->isValid(); } +void Context::isPotentialConstantExprUnevaluated(State &Parent, const Expr *E, + const FunctionDecl *FD) { + assert(Stk.empty()); + ++EvalID; + size_t StackSizeBefore = Stk.size(); + Compiler C(*this, *P, Parent, Stk); + + if (!C.interpretCall(FD, E)) { + C.cleanup(); + Stk.clearTo(StackSizeBefore); + } +} + bool Context::evaluateAsRValue(State &Parent, const Expr *E, APValue &Result) { ++EvalID; bool Recursing = !Stk.empty(); @@ -222,6 +235,43 @@ bool Context::evaluateCharRange(State &Parent, const Expr *SizeExpr, return evaluateStringRepr(Parent, SizeExpr, PtrExpr, Result); } +bool Context::evaluateStrlen(State &Parent, const Expr *E, uint64_t &Result) { + assert(Stk.empty()); + Compiler C(*this, *P, Parent, Stk); + + auto PtrRes = C.interpretAsPointer(E, [&](const Pointer &Ptr) { + const Descriptor *FieldDesc = Ptr.getFieldDesc(); + if (!FieldDesc->isPrimitiveArray()) + return false; + + unsigned N = Ptr.getNumElems(); + if (Ptr.elemSize() == 1) { + Result = strnlen(reinterpret_cast(Ptr.getRawAddress()), N); + return Result != N; + } + + PrimType ElemT = FieldDesc->getPrimType(); + Result = 0; + for (unsigned I = Ptr.getIndex(); I != N; ++I) { + INT_TYPE_SWITCH(ElemT, { + auto Elem = Ptr.elem(I); + if (Elem.isZero()) + return true; + ++Result; + }); + } + // We didn't find a 0 byte. + return false; + }); + + if (PtrRes.isInvalid()) { + C.cleanup(); + Stk.clear(); + return false; + } + return true; +} + const LangOptions &Context::getLangOpts() const { return Ctx.getLangOpts(); } static PrimType integralTypeToPrimTypeS(unsigned BitWidth) { @@ -256,7 +306,7 @@ static PrimType integralTypeToPrimTypeU(unsigned BitWidth) { llvm_unreachable("Unhandled BitWidth"); } -std::optional Context::classify(QualType T) const { +OptPrimType Context::classify(QualType T) const { if (const auto *BT = dyn_cast(T.getCanonicalType())) { auto Kind = BT->getKind(); @@ -492,7 +542,7 @@ const Function *Context::getOrCreateFunction(const FunctionDecl *FuncDecl) { // Assign descriptors to all parameters. // Composite objects are lowered to pointers. for (const ParmVarDecl *PD : FuncDecl->parameters()) { - std::optional T = classify(PD->getType()); + OptPrimType T = classify(PD->getType()); PrimType PT = T.value_or(PT_Ptr); Descriptor *Desc = P->createDescriptor(PD, PT); ParamDescriptors.insert({ParamOffset, {PT, Desc}}); @@ -520,7 +570,7 @@ const Function *Context::getOrCreateObjCBlock(const BlockExpr *E) { // Assign descriptors to all parameters. // Composite objects are lowered to pointers. for (const ParmVarDecl *PD : BD->parameters()) { - std::optional T = classify(PD->getType()); + OptPrimType T = classify(PD->getType()); PrimType PT = T.value_or(PT_Ptr); Descriptor *Desc = P->createDescriptor(PD, PT); ParamDescriptors.insert({ParamOffset, {PT, Desc}}); diff --git a/clang/lib/AST/ByteCode/Context.h b/clang/lib/AST/ByteCode/Context.h index 5898ab5e54599..62ef5297bd19f 100644 --- a/clang/lib/AST/ByteCode/Context.h +++ b/clang/lib/AST/ByteCode/Context.h @@ -47,7 +47,9 @@ class Context final { ~Context(); /// Checks if a function is a potential constant expression. 
- bool isPotentialConstantExpr(State &Parent, const FunctionDecl *FnDecl); + bool isPotentialConstantExpr(State &Parent, const FunctionDecl *FD); + void isPotentialConstantExprUnevaluated(State &Parent, const Expr *E, + const FunctionDecl *FD); /// Evaluates a toplevel expression as an rvalue. bool evaluateAsRValue(State &Parent, const Expr *E, APValue &Result); @@ -64,6 +66,10 @@ class Context final { bool evaluateCharRange(State &Parent, const Expr *SizeExpr, const Expr *PtrExpr, std::string &Result); + /// Evalute \param E and if it can be evaluated to a string literal, + /// run strlen() on it. + bool evaluateStrlen(State &Parent, const Expr *E, uint64_t &Result); + /// Returns the AST context. ASTContext &getASTContext() const { return Ctx; } /// Returns the language options. @@ -76,10 +82,10 @@ class Context final { uint32_t getBitWidth(QualType T) const { return Ctx.getIntWidth(T); } /// Classifies a type. - std::optional classify(QualType T) const; + OptPrimType classify(QualType T) const; /// Classifies an expression. - std::optional classify(const Expr *E) const { + OptPrimType classify(const Expr *E) const { assert(E); if (E->isGLValue()) return PT_Ptr; diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp index c89eca9bef440..5b9f44518fcc2 100644 --- a/clang/lib/AST/ByteCode/Descriptor.cpp +++ b/clang/lib/AST/ByteCode/Descriptor.cpp @@ -162,6 +162,8 @@ static void initField(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable, Desc->IsConst = IsConst || D->IsConst; Desc->IsFieldMutable = IsMutable || D->IsMutable; Desc->IsVolatile = IsVolatile || D->IsVolatile; + // True if this field is const AND the parent is mutable. + Desc->IsConstInMutable = Desc->IsConst && IsMutable; if (auto Fn = D->CtorFn) Fn(B, Ptr + FieldOffset, Desc->IsConst, Desc->IsFieldMutable, diff --git a/clang/lib/AST/ByteCode/Descriptor.h b/clang/lib/AST/ByteCode/Descriptor.h index 4591eabb69bb4..4c925f6f0af1e 100644 --- a/clang/lib/AST/ByteCode/Descriptor.h +++ b/clang/lib/AST/ByteCode/Descriptor.h @@ -101,6 +101,10 @@ struct InlineDescriptor { /// Flag indicating if the field is mutable (if in a record). LLVM_PREFERRED_TYPE(bool) unsigned IsFieldMutable : 1; + /// Flag indicating if this field is a const field nested in + /// a mutable parent field. + LLVM_PREFERRED_TYPE(bool) + unsigned IsConstInMutable : 1; /// Flag indicating if the field is an element of a composite array. LLVM_PREFERRED_TYPE(bool) unsigned IsArrayElement : 1; @@ -160,7 +164,7 @@ struct Descriptor final { /// The primitive type this descriptor was created for, /// or the primitive element type in case this is /// a primitive array. - const std::optional PrimT = std::nullopt; + const OptPrimType PrimT = std::nullopt; /// Flag indicating if the block is mutable. const bool IsConst = false; /// Flag indicating if a field is mutable. 
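`Context::evaluateStrlen`, declared above, gives the bytecode interpreter a direct way to measure a constant string without interpreting a full loop. The user-visible behaviour it serves is constant evaluation of strlen-style builtins, e.g.:

```cpp
// A sketch of the constant-evaluation behaviour the new entry point backs;
// the length is computed entirely at compile time.
constexpr char Msg[] = "bytecode";
static_assert(__builtin_strlen(Msg) == 8, "length folded during evaluation");
```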
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index f64501f4a31e8..74399d177b5a2 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -445,6 +445,7 @@ LLVM_DUMP_METHOD void InlineDescriptor::dump(llvm::raw_ostream &OS) const { OS << "InUnion: " << InUnion << "\n"; OS << "IsFieldMutable: " << IsFieldMutable << "\n"; OS << "IsArrayElement: " << IsArrayElement << "\n"; + OS << "IsConstInMutable: " << IsConstInMutable << '\n'; OS << "Desc: "; if (Desc) Desc->dump(OS); diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index 5498065657e0a..81ebc5694d6f0 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -90,6 +90,19 @@ EvaluationResult EvalEmitter::interpretAsPointer(const Expr *E, return std::move(this->EvalResult); } +bool EvalEmitter::interpretCall(const FunctionDecl *FD, const Expr *E) { + // Add parameters to the parameter map. The values in the ParamOffset don't + // matter in this case as reading from them can't ever work. + for (const ParmVarDecl *PD : FD->parameters()) { + this->Params.insert({PD, {0, false}}); + } + + if (!this->visit(E)) + return false; + PrimType T = Ctx.classify(E).value_or(PT_Ptr); + return this->emitPop(T, E); +} + void EvalEmitter::emitLabel(LabelTy Label) { CurrentLabel = Label; } EvalEmitter::LabelTy EvalEmitter::getLabel() { return NextLabel++; } @@ -311,7 +324,7 @@ void EvalEmitter::updateGlobalTemporaries() { const Pointer &Ptr = P.getPtrGlobal(*GlobalIndex); APValue *Cached = Temp->getOrCreateValue(true); - if (std::optional T = Ctx.classify(E->getType())) { + if (OptPrimType T = Ctx.classify(E->getType())) { TYPE_SWITCH( *T, { *Cached = Ptr.deref().toAPValue(Ctx.getASTContext()); }); } else { diff --git a/clang/lib/AST/ByteCode/EvalEmitter.h b/clang/lib/AST/ByteCode/EvalEmitter.h index 7303adba22af7..2fe7da608c739 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.h +++ b/clang/lib/AST/ByteCode/EvalEmitter.h @@ -40,6 +40,9 @@ class EvalEmitter : public SourceMapper { EvaluationResult interpretDecl(const VarDecl *VD, bool CheckFullyInitialized); /// Interpret the given Expr to a Pointer. EvaluationResult interpretAsPointer(const Expr *E, PtrCallback PtrCB); + /// Interpret the given expression as if it was in the body of the given + /// function, i.e. the parameters of the function are available for use. + bool interpretCall(const FunctionDecl *FD, const Expr *E); /// Clean up all resources. 
void cleanup(); diff --git a/clang/lib/AST/ByteCode/EvaluationResult.cpp b/clang/lib/AST/ByteCode/EvaluationResult.cpp index f59612bf00015..b11531f4296a2 100644 --- a/clang/lib/AST/ByteCode/EvaluationResult.cpp +++ b/clang/lib/AST/ByteCode/EvaluationResult.cpp @@ -204,7 +204,7 @@ static void collectBlocks(const Pointer &Ptr, } else if (Desc->isPrimitiveArray() && Desc->getPrimType() == PT_Ptr) { for (unsigned I = 0; I != Desc->getNumElems(); ++I) { - const Pointer &ElemPointee = Ptr.atIndex(I).deref(); + const Pointer &ElemPointee = Ptr.elem(I); if (isUsefulPtr(ElemPointee) && !Blocks.contains(ElemPointee.block())) collectBlocks(ElemPointee, Blocks); } diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 98fb8c8fcded5..5463aecf23087 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -142,8 +142,12 @@ static bool diagnoseUnknownDecl(InterpState &S, CodePtr OpPC, return false; if (isa(D)) { - if (D->getType()->isReferenceType()) + if (D->getType()->isReferenceType()) { + if (S.inConstantContext() && S.getLangOpts().CPlusPlus && + !S.getLangOpts().CPlusPlus11) + diagnoseNonConstVariable(S, OpPC, D); return false; + } const SourceInfo &Loc = S.Current->getSource(OpPC); if (S.getLangOpts().CPlusPlus11) { @@ -566,7 +570,10 @@ bool CheckDowncast(InterpState &S, CodePtr OpPC, const Pointer &Ptr, bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { assert(Ptr.isLive() && "Pointer is not live"); - if (!Ptr.isConst() || Ptr.isMutable()) + if (!Ptr.isConst()) + return true; + + if (Ptr.isMutable() && !Ptr.isConstInMutable()) return true; if (!Ptr.isBlockPointer()) @@ -574,7 +581,7 @@ bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { // The This pointer is writable in constructors and destructors, // even if isConst() returns true. 
- if (llvm::find(S.InitializingBlocks, Ptr.block())) + if (llvm::is_contained(S.InitializingBlocks, Ptr.block())) return true; const QualType Ty = Ptr.getType(); @@ -658,6 +665,9 @@ bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, if (Ptr.isInitialized()) return true; + if (Ptr.isExtern() && S.checkingPotentialConstantExpression()) + return false; + if (const auto *VD = Ptr.getDeclDesc()->asVarDecl(); VD && (VD->isConstexpr() || VD->hasGlobalStorage())) { @@ -815,7 +825,7 @@ bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return true; } -bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { +static bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!CheckLive(S, OpPC, Ptr, AK_MemberCall)) return false; if (!Ptr.isDummy()) { @@ -937,7 +947,7 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { return false; } -bool CheckCallDepth(InterpState &S, CodePtr OpPC) { +static bool CheckCallDepth(InterpState &S, CodePtr OpPC) { if ((S.Current->getDepth() + 1) > S.getLangOpts().ConstexprCallDepth) { S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_depth_limit_exceeded) @@ -1092,8 +1102,8 @@ bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return false; } -bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, - const CallExpr *CE, unsigned ArgSize) { +static bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, + const CallExpr *CE, unsigned ArgSize) { auto Args = ArrayRef(CE->getArgs(), CE->getNumArgs()); auto NonNullArgs = collectNonNullArgs(F->getDecl(), Args); unsigned Offset = 0; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 6be68e4a978b5..b42c7665c3a35 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -98,26 +98,12 @@ bool CheckGlobalInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr); /// Checks if a value can be stored in a block. bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -/// Checks if a method can be invoked on an object. -bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr); - /// Checks if a value can be initialized. bool CheckInit(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -/// Checks if a method can be called. -bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F); - -/// Checks if calling the currently active function would exceed -/// the allowed call depth. -bool CheckCallDepth(InterpState &S, CodePtr OpPC); - /// Checks the 'this' pointer. bool CheckThis(InterpState &S, CodePtr OpPC, const Pointer &This); -/// Checks if all the arguments annotated as 'nonnull' are in fact not null. -bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, - const CallExpr *CE, unsigned ArgSize); - /// Checks if dynamic memory allocation is available in the current /// language mode. 
bool CheckDynamicMemoryAllocation(InterpState &S, CodePtr OpPC); @@ -482,10 +468,10 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) { const Pointer &Result = S.Stk.peek(); if constexpr (std::is_same_v) { - APFloat A = LHS.atIndex(0).deref().getAPFloat(); - APFloat B = LHS.atIndex(1).deref().getAPFloat(); - APFloat C = RHS.atIndex(0).deref().getAPFloat(); - APFloat D = RHS.atIndex(1).deref().getAPFloat(); + APFloat A = LHS.elem(0).getAPFloat(); + APFloat B = LHS.elem(1).getAPFloat(); + APFloat C = RHS.elem(0).getAPFloat(); + APFloat D = RHS.elem(1).getAPFloat(); APFloat ResR(A.getSemantics()); APFloat ResI(A.getSemantics()); @@ -494,20 +480,20 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) { // Copy into the result. Floating RA = S.allocFloat(A.getSemantics()); RA.copy(ResR); - Result.atIndex(0).deref() = RA; // Floating(ResR); + Result.elem(0) = RA; // Floating(ResR); Result.atIndex(0).initialize(); Floating RI = S.allocFloat(A.getSemantics()); RI.copy(ResI); - Result.atIndex(1).deref() = RI; // Floating(ResI); + Result.elem(1) = RI; // Floating(ResI); Result.atIndex(1).initialize(); Result.initialize(); } else { // Integer element type. - const T &LHSR = LHS.atIndex(0).deref(); - const T &LHSI = LHS.atIndex(1).deref(); - const T &RHSR = RHS.atIndex(0).deref(); - const T &RHSI = RHS.atIndex(1).deref(); + const T &LHSR = LHS.elem(0); + const T &LHSI = LHS.elem(1); + const T &RHSR = RHS.elem(0); + const T &RHSI = RHS.elem(1); unsigned Bits = LHSR.bitWidth(); // real(Result) = (real(LHS) * real(RHS)) - (imag(LHS) * imag(RHS)) @@ -517,7 +503,7 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) { T B; if (T::mul(LHSI, RHSI, Bits, &B)) return false; - if (T::sub(A, B, Bits, &Result.atIndex(0).deref())) + if (T::sub(A, B, Bits, &Result.elem(0))) return false; Result.atIndex(0).initialize(); @@ -526,7 +512,7 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) { return false; if (T::mul(LHSI, RHSR, Bits, &B)) return false; - if (T::add(A, B, Bits, &Result.atIndex(1).deref())) + if (T::add(A, B, Bits, &Result.elem(1))) return false; Result.atIndex(1).initialize(); Result.initialize(); @@ -542,10 +528,10 @@ inline bool Divc(InterpState &S, CodePtr OpPC) { const Pointer &Result = S.Stk.peek(); if constexpr (std::is_same_v) { - APFloat A = LHS.atIndex(0).deref().getAPFloat(); - APFloat B = LHS.atIndex(1).deref().getAPFloat(); - APFloat C = RHS.atIndex(0).deref().getAPFloat(); - APFloat D = RHS.atIndex(1).deref().getAPFloat(); + APFloat A = LHS.elem(0).getAPFloat(); + APFloat B = LHS.elem(1).getAPFloat(); + APFloat C = RHS.elem(0).getAPFloat(); + APFloat D = RHS.elem(1).getAPFloat(); APFloat ResR(A.getSemantics()); APFloat ResI(A.getSemantics()); @@ -554,21 +540,21 @@ inline bool Divc(InterpState &S, CodePtr OpPC) { // Copy into the result. Floating RA = S.allocFloat(A.getSemantics()); RA.copy(ResR); - Result.atIndex(0).deref() = RA; // Floating(ResR); + Result.elem(0) = RA; // Floating(ResR); Result.atIndex(0).initialize(); Floating RI = S.allocFloat(A.getSemantics()); RI.copy(ResI); - Result.atIndex(1).deref() = RI; // Floating(ResI); + Result.elem(1) = RI; // Floating(ResI); Result.atIndex(1).initialize(); Result.initialize(); } else { // Integer element type. 
- const T &LHSR = LHS.atIndex(0).deref(); - const T &LHSI = LHS.atIndex(1).deref(); - const T &RHSR = RHS.atIndex(0).deref(); - const T &RHSI = RHS.atIndex(1).deref(); + const T &LHSR = LHS.elem(0); + const T &LHSI = LHS.elem(1); + const T &RHSR = RHS.elem(0); + const T &RHSI = RHS.elem(1); unsigned Bits = LHSR.bitWidth(); const T Zero = T::from(0, Bits); @@ -595,8 +581,8 @@ inline bool Divc(InterpState &S, CodePtr OpPC) { } // real(Result) = ((real(LHS) * real(RHS)) + (imag(LHS) * imag(RHS))) / Den - T &ResultR = Result.atIndex(0).deref(); - T &ResultI = Result.atIndex(1).deref(); + T &ResultR = Result.elem(0); + T &ResultI = Result.elem(1); if (T::mul(LHSR, RHSR, Bits, &A) || T::mul(LHSI, RHSI, Bits, &B)) return false; @@ -1322,7 +1308,7 @@ bool Dup(InterpState &S, CodePtr OpPC) { template ::T> bool Pop(InterpState &S, CodePtr OpPC) { - S.Stk.pop(); + S.Stk.discard(); return true; } @@ -1899,6 +1885,16 @@ inline bool Dump(InterpState &S, CodePtr OpPC) { return true; } +inline bool CheckNull(InterpState &S, CodePtr OpPC) { + const auto &Ptr = S.Stk.peek(); + if (Ptr.isZero()) { + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_dereferencing_null); + return S.noteUndefinedBehavior(); + } + return true; +} + inline bool VirtBaseHelper(InterpState &S, CodePtr OpPC, const RecordDecl *Decl, const Pointer &Ptr) { Pointer Base = Ptr; @@ -3107,7 +3103,7 @@ inline bool ArrayElem(InterpState &S, CodePtr OpPC, uint32_t Index) { return false; assert(Ptr.atIndex(Index).getFieldDesc()->getPrimType() == Name); - S.Stk.push(Ptr.atIndex(Index).deref()); + S.Stk.push(Ptr.elem(Index)); return true; } @@ -3119,7 +3115,7 @@ inline bool ArrayElemPop(InterpState &S, CodePtr OpPC, uint32_t Index) { return false; assert(Ptr.atIndex(Index).getFieldDesc()->getPrimType() == Name); - S.Stk.push(Ptr.atIndex(Index).deref()); + S.Stk.push(Ptr.elem(Index)); return true; } @@ -3561,12 +3557,22 @@ inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte, Floating Result = S.allocFloat(*Sem); Floating::bitcastFromMemory(Buff.data(), *Sem, &Result); S.Stk.push(Result); - - // S.Stk.push(T::bitcastFromMemory(Buff.data(), *Sem)); } else if constexpr (needsAlloc()) { T Result = S.allocAP(ResultBitWidth); T::bitcastFromMemory(Buff.data(), ResultBitWidth, &Result); S.Stk.push(Result); + } else if constexpr (std::is_same_v) { + // Only allow to cast single-byte integers to bool if they are either 0 + // or 1. 
+ assert(FullBitWidth.getQuantity() == 8); + auto Val = static_cast(Buff[0]); + if (Val > 1) { + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_bit_cast_unrepresentable_value) + << S.getASTContext().BoolTy << Val; + return false; + } + S.Stk.push(T::bitcastFromMemory(Buff.data(), ResultBitWidth)); } else { assert(!Sem); S.Stk.push(T::bitcastFromMemory(Buff.data(), ResultBitWidth)); diff --git a/clang/lib/AST/ByteCode/InterpBlock.h b/clang/lib/AST/ByteCode/InterpBlock.h index 7798b6f886a85..51622238e275c 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.h +++ b/clang/lib/AST/ByteCode/InterpBlock.h @@ -14,11 +14,6 @@ #define LLVM_CLANG_AST_INTERP_BLOCK_H #include "Descriptor.h" -#include "clang/AST/ComparisonCategories.h" -#include "clang/AST/Decl.h" -#include "clang/AST/DeclCXX.h" -#include "clang/AST/Expr.h" -#include "llvm/ADT/PointerUnion.h" #include "llvm/Support/raw_ostream.h" namespace clang { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index de0b97fd93c76..19d4c0c5b48d2 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "../ExprConstShared.h" #include "Boolean.h" -#include "Compiler.h" #include "EvalEmitter.h" #include "Interp.h" #include "InterpBuiltinBitCast.h" @@ -54,7 +53,7 @@ static APSInt popToAPSInt(InterpStack &Stk, PrimType T) { static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) { assert(QT->isSignedIntegerOrEnumerationType() || QT->isUnsignedIntegerOrEnumerationType()); - std::optional T = S.getContext().classify(QT); + OptPrimType T = S.getContext().classify(QT); assert(T); unsigned BitWidth = S.getASTContext().getTypeSize(QT); @@ -1099,9 +1098,9 @@ static bool interp__builtin_complex(InterpState &S, CodePtr OpPC, const Floating &Arg1 = S.Stk.pop(); Pointer &Result = S.Stk.peek(); - Result.atIndex(0).deref() = Arg1; + Result.elem(0) = Arg1; Result.atIndex(0).initialize(); - Result.atIndex(1).deref() = Arg2; + Result.elem(1) = Arg2; Result.atIndex(1).initialize(); Result.initialize(); @@ -1531,7 +1530,7 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC, return false; bool IsArray = NumElems.ugt(1); - std::optional ElemT = S.getContext().classify(ElemType); + OptPrimType ElemT = S.getContext().classify(ElemType); DynamicAllocator &Allocator = S.getAllocator(); if (ElemT) { Block *B = @@ -1645,10 +1644,10 @@ static bool interp__builtin_vector_reduce(InterpState &S, CodePtr OpPC, unsigned NumElems = Arg.getNumElems(); INT_TYPE_SWITCH_NO_BOOL(ElemT, { - T Result = Arg.atIndex(0).deref(); + T Result = Arg.elem(0); unsigned BitWidth = Result.bitWidth(); for (unsigned I = 1; I != NumElems; ++I) { - T Elem = Arg.atIndex(I).deref(); + T Elem = Arg.elem(I); T PrevResult = Result; if (ID == Builtin::BI__builtin_reduce_add) { @@ -1724,11 +1723,10 @@ static bool interp__builtin_elementwise_popcount(InterpState &S, CodePtr OpPC, for (unsigned I = 0; I != NumElems; ++I) { INT_TYPE_SWITCH_NO_BOOL(ElemT, { if (BuiltinID == Builtin::BI__builtin_elementwise_popcount) { - Dst.atIndex(I).deref() = - T::from(Arg.atIndex(I).deref().toAPSInt().popcount()); + Dst.elem(I) = T::from(Arg.elem(I).toAPSInt().popcount()); } else { - Dst.atIndex(I).deref() = T::from( - Arg.atIndex(I).deref().toAPSInt().reverseBits().getZExtValue()); + Dst.elem(I) = + T::from(Arg.elem(I).toAPSInt().reverseBits().getZExtValue()); } Dst.atIndex(I).initialize(); }); 
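As a point of reference (not part of this patch), the interp__builtin_vector_reduce and interp__builtin_elementwise_popcount routines above back constant evaluation of calls like the following sketch; the vector typedef uses the vector_size extension, and folding these in a constant expression assumes a Clang version in which the builtins are constexpr-capable.

  typedef int v4si __attribute__((vector_size(16)));

  constexpr v4si V = {1, 2, 3, 4};
  static_assert(__builtin_reduce_add(V) == 10, "");
  static_assert(__builtin_elementwise_popcount(7u) == 3u, "");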
@@ -2297,8 +2295,8 @@ static bool interp__builtin_elementwise_sat(InterpState &S, CodePtr OpPC, APSInt Elem1; APSInt Elem2; INT_TYPE_SWITCH_NO_BOOL(ElemT, { - Elem1 = LHS.atIndex(I).deref().toAPSInt(); - Elem2 = RHS.atIndex(I).deref().toAPSInt(); + Elem1 = LHS.elem(I).toAPSInt(); + Elem2 = RHS.elem(I).toAPSInt(); }); APSInt Result; @@ -2881,7 +2879,7 @@ static bool copyRecord(InterpState &S, CodePtr OpPC, const Pointer &Src, auto copyField = [&](const Record::Field &F, bool Activate) -> bool { Pointer DestField = Dest.atField(F.Offset); - if (std::optional FT = S.Ctx.classify(F.Decl->getType())) { + if (OptPrimType FT = S.Ctx.classify(F.Decl->getType())) { TYPE_SWITCH(*FT, { DestField.deref() = Src.atField(F.Offset).deref(); if (Src.atField(F.Offset).isInitialized()) @@ -2906,6 +2904,8 @@ static bool copyRecord(InterpState &S, CodePtr OpPC, const Pointer &Src, if (!copyField(F, /*Activate=*/true)) return false; } else { + if (!CheckMutable(S, OpPC, Src.atField(F.Offset))) + return false; Pointer DestField = Dest.atField(F.Offset); zeroAll(DestField); } @@ -2941,7 +2941,7 @@ static bool copyComposite(InterpState &S, CodePtr OpPC, const Pointer &Src, for (unsigned I = 0, N = DestDesc->getNumElems(); I != N; ++I) { Pointer DestElem = Dest.atIndex(I); TYPE_SWITCH(ET, { - DestElem.deref() = Src.atIndex(I).deref(); + DestElem.deref() = Src.elem(I); DestElem.initialize(); }); } diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index a5a4bd25fe712..d62a4f6275b50 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -128,6 +128,11 @@ static bool shouldSkipInBacktrace(const Function *F) { if (FD->getDeclName().getCXXOverloadedOperator() == OO_New || FD->getDeclName().getCXXOverloadedOperator() == OO_Array_New) return true; + + if (const auto *MD = dyn_cast(FD); + MD && MD->getParent()->isAnonymousStructOrUnion()) + return true; + return false; } diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 804853d29512e..80703ad72d954 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -865,6 +865,7 @@ def CheckNewTypeMismatchArray : Opcode { def IsConstantContext: Opcode; def CheckAllocations : Opcode; +def CheckNull : Opcode; def BitCastTypeClass : TypeClass { let Types = [Uint8, Sint8, Uint16, Sint16, Uint32, Sint32, Uint64, Sint64, diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index 2f9ecf98e558e..4019b74669282 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -665,7 +665,7 @@ std::optional Pointer::toRValue(const Context &Ctx, return false; // Primitive values. 
- if (std::optional T = Ctx.classify(Ty)) { + if (OptPrimType T = Ctx.classify(Ty)) { TYPE_SWITCH(*T, R = Ptr.deref().toAPValue(ASTCtx)); return true; } @@ -682,7 +682,7 @@ std::optional Pointer::toRValue(const Context &Ctx, const Pointer &FP = Ptr.atField(F.Offset); QualType FieldTy = F.Decl->getType(); if (FP.isActive()) { - if (std::optional T = Ctx.classify(FieldTy)) { + if (OptPrimType T = Ctx.classify(FieldTy)) { TYPE_SWITCH(*T, Value = FP.deref().toAPValue(ASTCtx)); } else { Ok &= Composite(FieldTy, FP, Value); @@ -705,7 +705,7 @@ std::optional Pointer::toRValue(const Context &Ctx, const Pointer &FP = Ptr.atField(FD->Offset); APValue &Value = R.getStructField(I); - if (std::optional T = Ctx.classify(FieldTy)) { + if (OptPrimType T = Ctx.classify(FieldTy)) { TYPE_SWITCH(*T, Value = FP.deref().toAPValue(ASTCtx)); } else { Ok &= Composite(FieldTy, FP, Value); @@ -743,7 +743,7 @@ std::optional Pointer::toRValue(const Context &Ctx, for (unsigned I = 0; I < NumElems; ++I) { APValue &Slot = R.getArrayInitializedElt(I); const Pointer &EP = Ptr.atIndex(I); - if (std::optional T = Ctx.classify(ElemTy)) { + if (OptPrimType T = Ctx.classify(ElemTy)) { TYPE_SWITCH(*T, Slot = EP.deref().toAPValue(ASTCtx)); } else { Ok &= Composite(ElemTy, EP.narrow(), Slot); @@ -757,17 +757,17 @@ std::optional Pointer::toRValue(const Context &Ctx, QualType ElemTy = CT->getElementType(); if (ElemTy->isIntegerType()) { - std::optional ElemT = Ctx.classify(ElemTy); + OptPrimType ElemT = Ctx.classify(ElemTy); assert(ElemT); INT_TYPE_SWITCH(*ElemT, { - auto V1 = Ptr.atIndex(0).deref(); - auto V2 = Ptr.atIndex(1).deref(); + auto V1 = Ptr.elem(0); + auto V2 = Ptr.elem(1); R = APValue(V1.toAPSInt(), V2.toAPSInt()); return true; }); } else if (ElemTy->isFloatingType()) { - R = APValue(Ptr.atIndex(0).deref().getAPFloat(), - Ptr.atIndex(1).deref().getAPFloat()); + R = APValue(Ptr.elem(0).getAPFloat(), + Ptr.elem(1).getAPFloat()); return true; } return false; @@ -782,9 +782,8 @@ std::optional Pointer::toRValue(const Context &Ctx, SmallVector Values; Values.reserve(VT->getNumElements()); for (unsigned I = 0; I != VT->getNumElements(); ++I) { - TYPE_SWITCH(ElemT, { - Values.push_back(Ptr.atIndex(I).deref().toAPValue(ASTCtx)); - }); + TYPE_SWITCH(ElemT, + { Values.push_back(Ptr.elem(I).toAPValue(ASTCtx)); }); } assert(Values.size() == VT->getNumElements()); @@ -804,7 +803,7 @@ std::optional Pointer::toRValue(const Context &Ctx, return toAPValue(ASTCtx); // Just load primitive types. - if (std::optional T = Ctx.classify(ResultType)) { + if (OptPrimType T = Ctx.classify(ResultType)) { TYPE_SWITCH(*T, return this->deref().toAPValue(ASTCtx)); } diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index e6a64e6658f06..d17eba5da9ca6 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -576,6 +576,11 @@ class Pointer { return true; return isRoot() ? getDeclDesc()->IsConst : getInlineDesc()->IsConst; } + bool isConstInMutable() const { + if (!isBlockPointer()) + return false; + return isRoot() ? false : getInlineDesc()->IsConstInMutable; + } /// Checks if an object or a subfield is volatile. bool isVolatile() const { @@ -688,6 +693,25 @@ class Pointer { return *reinterpret_cast(asBlockPointer().Pointee->rawData() + Offset); } + /// Dereferences the element at index \p I. + /// This is equivalent to atIndex(I).deref(). 
+ template T &elem(unsigned I) const { + assert(isLive() && "Invalid pointer"); + assert(isBlockPointer()); + assert(asBlockPointer().Pointee); + assert(isDereferencable()); + assert(getFieldDesc()->isPrimitiveArray()); + + unsigned ElemByteOffset = I * getFieldDesc()->getElemSize(); + if (isArrayRoot()) + return *reinterpret_cast(asBlockPointer().Pointee->rawData() + + asBlockPointer().Base + sizeof(InitMapPtr) + + ElemByteOffset); + + return *reinterpret_cast(asBlockPointer().Pointee->rawData() + Offset + + ElemByteOffset); + } + /// Whether this block can be read from at all. This is only true for /// block pointers that point to a valid location inside that block. bool isDereferencable() const { diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h index a156cccbb3c1b..38c29b9f82672 100644 --- a/clang/lib/AST/ByteCode/PrimType.h +++ b/clang/lib/AST/ByteCode/PrimType.h @@ -13,6 +13,7 @@ #ifndef LLVM_CLANG_AST_INTERP_TYPE_H #define LLVM_CLANG_AST_INTERP_TYPE_H +#include "clang/Basic/UnsignedOrNone.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -49,6 +50,38 @@ enum PrimType : unsigned { PT_MemberPtr = 14, }; +// Like std::optional, but only sizeof(PrimType). +class OptPrimType final { + unsigned V = ~0u; + +public: + OptPrimType() = default; + OptPrimType(std::nullopt_t) {} + OptPrimType(PrimType T) : V(static_cast(T)) {} + + explicit constexpr operator bool() const { return V != ~0u; } + PrimType operator*() const { + assert(operator bool()); + return static_cast(V); + } + + PrimType value_or(PrimType PT) const { + if (operator bool()) + return static_cast(V); + return PT; + } + + bool operator==(PrimType PT) const { + if (!operator bool()) + return false; + return V == static_cast(PT); + } + bool operator==(OptPrimType OPT) const { return V == OPT.V; } + bool operator!=(PrimType PT) const { return !(*this == PT); } + bool operator!=(OptPrimType OPT) const { return V != OPT.V; } +}; +static_assert(sizeof(OptPrimType) == sizeof(PrimType)); + inline constexpr bool isPtrType(PrimType T) { return T == PT_Ptr || T == PT_MemberPtr; } diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index 5ac0f59f32d4e..7002724a7a4fe 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -74,27 +74,25 @@ unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) { const Pointer Ptr(G->block()); if (CharWidth == 1) { - std::memcpy(&Ptr.atIndex(0).deref(), S->getString().data(), - StringLength); + std::memcpy(&Ptr.elem(0), S->getString().data(), StringLength); } else { // Construct the string in storage. for (unsigned I = 0; I <= StringLength; ++I) { - Pointer Field = Ptr.atIndex(I); const uint32_t CodePoint = I == StringLength ? 
0 : S->getCodeUnit(I); switch (CharType) { case PT_Sint8: { using T = PrimConv::T; - Field.deref() = T::from(CodePoint, BitWidth); + Ptr.elem(I) = T::from(CodePoint, BitWidth); break; } case PT_Uint16: { using T = PrimConv::T; - Field.deref() = T::from(CodePoint, BitWidth); + Ptr.elem(I) = T::from(CodePoint, BitWidth); break; } case PT_Uint32: { using T = PrimConv::T; - Field.deref() = T::from(CodePoint, BitWidth); + Ptr.elem(I) = T::from(CodePoint, BitWidth); break; } default: @@ -171,7 +169,7 @@ unsigned Program::getOrCreateDummy(const DeclTy &D) { assert(!QT.isNull()); Descriptor *Desc; - if (std::optional T = Ctx.classify(QT)) + if (OptPrimType T = Ctx.classify(QT)) Desc = createDescriptor(D, *T, /*SourceTy=*/nullptr, std::nullopt, /*IsConst=*/QT.isConstQualified()); else @@ -250,7 +248,7 @@ std::optional Program::createGlobal(const DeclTy &D, QualType Ty, const bool IsConst = Ty.isConstQualified(); const bool IsTemporary = D.dyn_cast(); const bool IsVolatile = Ty.isVolatileQualified(); - if (std::optional T = Ctx.classify(Ty)) + if (OptPrimType T = Ctx.classify(Ty)) Desc = createDescriptor(D, *T, nullptr, Descriptor::GlobalMD, IsConst, IsTemporary, /*IsMutable=*/false, IsVolatile); else @@ -373,7 +371,7 @@ Record *Program::getOrCreateRecord(const RecordDecl *RD) { const bool IsMutable = FD->isMutable(); const bool IsVolatile = FT.isVolatileQualified(); const Descriptor *Desc; - if (std::optional T = Ctx.classify(FT)) { + if (OptPrimType T = Ctx.classify(FT)) { Desc = createDescriptor(FD, *T, nullptr, std::nullopt, IsConst, /*isTemporary=*/false, IsMutable, IsVolatile); } else { @@ -412,7 +410,7 @@ Descriptor *Program::createDescriptor(const DeclTy &D, const Type *Ty, // Array of well-known bounds. if (const auto *CAT = dyn_cast(ArrayType)) { size_t NumElems = CAT->getZExtSize(); - if (std::optional T = Ctx.classify(ElemTy)) { + if (OptPrimType T = Ctx.classify(ElemTy)) { // Arrays of primitives. unsigned ElemSize = primSize(*T); if (std::numeric_limits::max() / ElemSize <= NumElems) { @@ -439,7 +437,7 @@ Descriptor *Program::createDescriptor(const DeclTy &D, const Type *Ty, // is forbidden on pointers to such objects. if (isa(ArrayType) || isa(ArrayType)) { - if (std::optional T = Ctx.classify(ElemTy)) { + if (OptPrimType T = Ctx.classify(ElemTy)) { return allocateDescriptor(D, *T, MDSize, IsConst, IsTemporary, Descriptor::UnknownSize{}); } else { @@ -462,7 +460,7 @@ Descriptor *Program::createDescriptor(const DeclTy &D, const Type *Ty, // Complex types - represented as arrays of elements. if (const auto *CT = Ty->getAs()) { - std::optional ElemTy = Ctx.classify(CT->getElementType()); + OptPrimType ElemTy = Ctx.classify(CT->getElementType()); if (!ElemTy) return nullptr; @@ -472,7 +470,7 @@ Descriptor *Program::createDescriptor(const DeclTy &D, const Type *Ty, // Same with vector types. if (const auto *VT = Ty->getAs()) { - std::optional ElemTy = Ctx.classify(VT->getElementType()); + OptPrimType ElemTy = Ctx.classify(VT->getElementType()); if (!ElemTy) return nullptr; diff --git a/clang/lib/AST/ByteCode/State.h b/clang/lib/AST/ByteCode/State.h index 9a81fa6b7d220..6fc33222ac956 100644 --- a/clang/lib/AST/ByteCode/State.h +++ b/clang/lib/AST/ByteCode/State.h @@ -35,6 +35,7 @@ enum AccessKinds { AK_Construct, AK_Destroy, AK_IsWithinLifetime, + AK_Dereference }; /// The order of this enum is important for diagnostics. 
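For context, a short sketch (not part of this patch) of what the new AK_Dereference access kind is used for in the ExprConstant.cpp changes below: dereferences performed while binding a reference or evaluating unary * get dedicated notes (note_constexpr_dereferencing_null, and the existing past-the-end note) instead of a generic access diagnostic.

  constexpr int *null_p = nullptr;
  // constexpr int &r1 = *null_p;       // rejected: dereferencing a null pointer

  constexpr int arr[2] = {1, 2};
  // constexpr const int &r2 = arr[2];  // rejected: binds a reference to the
                                        // one-past-the-end element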
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 4514965009793..673e3f73858c7 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -3211,6 +3211,12 @@ UsingDirectiveDecl *UsingDirectiveDecl::CreateDeserialized(ASTContext &C, SourceLocation(), nullptr, nullptr); } +NamespaceDecl *NamespaceBaseDecl::getNamespace() { + if (auto *Alias = dyn_cast(this)) + return Alias->getNamespace(); + return cast(this); +} + NamespaceDecl *UsingDirectiveDecl::getNominatedNamespace() { if (auto *NA = dyn_cast_or_null(NominatedNamespace)) return NA->getNamespace(); @@ -3221,7 +3227,7 @@ NamespaceDecl::NamespaceDecl(ASTContext &C, DeclContext *DC, bool Inline, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, NamespaceDecl *PrevDecl, bool Nested) - : NamedDecl(Namespace, DC, IdLoc, Id), DeclContext(Namespace), + : NamespaceBaseDecl(Namespace, DC, IdLoc, Id), DeclContext(Namespace), redeclarable_base(C), LocStart(StartLoc) { setInline(Inline); setNested(Nested); @@ -3268,13 +3274,11 @@ NamespaceAliasDecl *NamespaceAliasDecl::getMostRecentDeclImpl() { return getMostRecentDecl(); } -NamespaceAliasDecl *NamespaceAliasDecl::Create(ASTContext &C, DeclContext *DC, - SourceLocation UsingLoc, - SourceLocation AliasLoc, - IdentifierInfo *Alias, - NestedNameSpecifierLoc QualifierLoc, - SourceLocation IdentLoc, - NamedDecl *Namespace) { +NamespaceAliasDecl *NamespaceAliasDecl::Create( + ASTContext &C, DeclContext *DC, SourceLocation UsingLoc, + SourceLocation AliasLoc, IdentifierInfo *Alias, + NestedNameSpecifierLoc QualifierLoc, SourceLocation IdentLoc, + NamespaceBaseDecl *Namespace) { // FIXME: Preserve the aliased namespace as written. if (auto *NS = dyn_cast_or_null(Namespace)) Namespace = NS->getFirstDecl(); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 1b33b6706e204..0d12161756467 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1529,7 +1529,7 @@ CallStackFrame::~CallStackFrame() { static bool isRead(AccessKinds AK) { return AK == AK_Read || AK == AK_ReadObjectRepresentation || - AK == AK_IsWithinLifetime; + AK == AK_IsWithinLifetime || AK == AK_Dereference; } static bool isModification(AccessKinds AK) { @@ -1540,6 +1540,7 @@ static bool isModification(AccessKinds AK) { case AK_DynamicCast: case AK_TypeId: case AK_IsWithinLifetime: + case AK_Dereference: return false; case AK_Assign: case AK_Increment: @@ -1558,15 +1559,16 @@ static bool isAnyAccess(AccessKinds AK) { /// Is this an access per the C++ definition? static bool isFormalAccess(AccessKinds AK) { return isAnyAccess(AK) && AK != AK_Construct && AK != AK_Destroy && - AK != AK_IsWithinLifetime; + AK != AK_IsWithinLifetime && AK != AK_Dereference; } -/// Is this kind of axcess valid on an indeterminate object value? +/// Is this kind of access valid on an indeterminate object value? static bool isValidIndeterminateAccess(AccessKinds AK) { switch (AK) { case AK_Read: case AK_Increment: case AK_Decrement: + case AK_Dereference: // These need the object's value. 
return false; @@ -1733,7 +1735,10 @@ namespace { bool checkNullPointerForFoldAccess(EvalInfo &Info, const Expr *E, AccessKinds AK) { return checkNullPointerDiagnosingWith([&Info, E, AK] { - Info.FFDiag(E, diag::note_constexpr_access_null) << AK; + if (AK == AccessKinds::AK_Dereference) + Info.FFDiag(E, diag::note_constexpr_dereferencing_null); + else + Info.FFDiag(E, diag::note_constexpr_access_null) << AK; }); } @@ -4305,7 +4310,10 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, } if (!LVal.Base) { - Info.FFDiag(E, diag::note_constexpr_access_null) << AK; + if (AK == AccessKinds::AK_Dereference) + Info.FFDiag(E, diag::note_constexpr_dereferencing_null); + else + Info.FFDiag(E, diag::note_constexpr_access_null) << AK; return CompleteObject(); } @@ -4407,8 +4415,9 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, ConstexprVar = VD->isConstexpr(); // Unless we're looking at a local variable or argument in a constexpr call, - // the variable we're reading must be const. - if (!Frame) { + // the variable we're reading must be const (unless we are binding to a + // reference). + if (AK != clang::AK_Dereference && !Frame) { if (IsAccess && isa(VD)) { // Access of a parameter that's not associated with a frame isn't going // to work out, but we can leave it to evaluateVarDeclInit to provide a @@ -4441,7 +4450,8 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, } } else if (!IsAccess) { return CompleteObject(LVal.getLValueBase(), nullptr, BaseType); - } else if (IsConstant && Info.checkingPotentialConstantExpression() && + } else if ((IsConstant || BaseType->isReferenceType()) && + Info.checkingPotentialConstantExpression() && BaseType->isLiteralType(Info.Ctx) && !VD->hasDefinition()) { // This variable might end up being constexpr. Don't diagnose it yet. } else if (IsConstant) { @@ -4472,15 +4482,21 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, } } - if (!evaluateVarDeclInit(Info, E, VD, Frame, LVal.getLValueVersion(), BaseVal)) + // When binding to a reference, the variable does not need to be constexpr + // or have constant initalization. + if (AK != clang::AK_Dereference && + !evaluateVarDeclInit(Info, E, VD, Frame, LVal.getLValueVersion(), + BaseVal)) return CompleteObject(); // If evaluateVarDeclInit sees a constexpr-unknown variable, it returns // a null BaseVal. Any constexpr-unknown variable seen here is an error: // we can't access a constexpr-unknown object. - if (!BaseVal) { - Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1) - << AK << VD; - Info.Note(VD->getLocation(), diag::note_declared_at); + if (AK != clang::AK_Dereference && !BaseVal) { + if (!Info.checkingPotentialConstantExpression()) { + Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1) + << AK << VD; + Info.Note(VD->getLocation(), diag::note_declared_at); + } return CompleteObject(); } } else if (DynamicAllocLValue DA = LVal.Base.dyn_cast()) { @@ -4491,7 +4507,10 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, } return CompleteObject(LVal.Base, &(*Alloc)->Value, LVal.Base.getDynamicAllocType()); - } else { + } + // When binding to a reference, the variable does not need to be + // within its lifetime. 
+ else if (AK != clang::AK_Dereference) { const Expr *Base = LVal.Base.dyn_cast(); if (!Frame) { @@ -4572,7 +4591,7 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, NoteLValueLocation(Info, LVal.Base); return CompleteObject(); } - } else { + } else if (AK != clang::AK_Dereference) { BaseVal = Frame->getTemporary(Base, LVal.Base.getVersion()); assert(BaseVal && "missing value for temporary"); } @@ -5200,6 +5219,29 @@ enum EvalStmtResult { ESR_CaseNotFound }; } +/// Evaluates the initializer of a reference. +static bool EvaluateInitForDeclOfReferenceType(EvalInfo &Info, + const ValueDecl *D, + const Expr *Init, LValue &Result, + APValue &Val) { + assert(Init->isGLValue() && D->getType()->isReferenceType()); + // A reference is an lvalue. + if (!EvaluateLValue(Init, Result, Info)) + return false; + // [C++26][decl.ref] + // The object designated by such a glvalue can be outside its lifetime + // Because a null pointer value or a pointer past the end of an object + // does not point to an object, a reference in a well-defined program cannot + // refer to such things; + if (!Result.Designator.Invalid && Result.Designator.isOnePastTheEnd()) { + Info.FFDiag(Init, diag::note_constexpr_access_past_end) << AK_Dereference; + return false; + } + + // Save the result. + Result.moveInto(Val); + return true; +} static bool EvaluateVarDecl(EvalInfo &Info, const VarDecl *VD) { if (VD->isInvalidDecl()) @@ -5221,7 +5263,11 @@ static bool EvaluateVarDecl(EvalInfo &Info, const VarDecl *VD) { if (InitE->isValueDependent()) return false; - if (!EvaluateInPlace(Val, Info, Result, InitE)) { + // For references to objects, check they do not designate a one-past-the-end + // object. + if (VD->getType()->isReferenceType()) { + return EvaluateInitForDeclOfReferenceType(Info, VD, InitE, Result, Val); + } else if (!EvaluateInPlace(Val, Info, Result, InitE)) { // Wipe out any partially-computed value, to allow tracking that this // evaluation failed. Val = APValue(); @@ -6851,9 +6897,18 @@ static bool HandleConstructorCall(const Expr *E, const LValue &This, ThisOverrideRAII ThisOverride(*Info.CurrentCall, &SubobjectParent, isa(Init)); FullExpressionRAII InitScope(Info); - if (!EvaluateInPlace(*Value, Info, Subobject, Init) || - (FD && FD->isBitField() && - !truncateBitfieldValue(Info, Init, *Value, FD))) { + if (FD && FD->getType()->isReferenceType() && + !FD->getType()->isFunctionReferenceType()) { + LValue Result; + if (!EvaluateInitForDeclOfReferenceType(Info, FD, Init, Result, + *Value)) { + if (!Info.noteFailure()) + return false; + Success = false; + } + } else if (!EvaluateInPlace(*Value, Info, Subobject, Init) || + (FD && FD->isBitField() && + !truncateBitfieldValue(Info, Init, *Value, FD))) { // If we're checking for a potential constant expression, evaluate all // initializers even if some of them fail. if (!Info.noteFailure()) @@ -9287,7 +9342,17 @@ bool LValueExprEvaluator::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { } bool LValueExprEvaluator::VisitUnaryDeref(const UnaryOperator *E) { - return evaluatePointer(E->getSubExpr(), Result); + bool Success = evaluatePointer(E->getSubExpr(), Result); + // [C++26][expr.unary.op] + // If the operand points to an object or function, the result + // denotes that object or function; otherwise, the behavior is undefined. + // Because &(*(type*)0) is a common pattern, we do not fail the evaluation + // immediately. 
+ if (!Success || !E->getType().getNonReferenceType()->isObjectType()) + return Success; + return bool(findCompleteObject(Info, E, AK_Dereference, Result, + E->getType())) || + Info.noteUndefinedBehavior(); } bool LValueExprEvaluator::VisitUnaryReal(const UnaryOperator *E) { @@ -10906,9 +10971,17 @@ bool RecordExprEvaluator::VisitCXXParenListOrInitListExpr( isa(Init)); APValue &FieldVal = Result.getStructField(Field->getFieldIndex()); - if (!EvaluateInPlace(FieldVal, Info, Subobject, Init) || - (Field->isBitField() && !truncateBitfieldValue(Info, Init, - FieldVal, Field))) { + if (Field->getType()->isReferenceType()) { + LValue Result; + if (!EvaluateInitForDeclOfReferenceType(Info, Field, Init, Result, + FieldVal)) { + if (!Info.noteFailure()) + return false; + Success = false; + } + } else if (!EvaluateInPlace(FieldVal, Info, Subobject, Init) || + (Field->isBitField() && + !truncateBitfieldValue(Info, Init, FieldVal, Field))) { if (!Info.noteFailure()) return false; Success = false; @@ -17949,6 +18022,11 @@ bool Expr::isPotentialConstantExprUnevaluated(Expr *E, Info.InConstantContext = true; Info.CheckingPotentialConstantExpression = true; + if (Info.EnableNewConstInterp) { + Info.Ctx.getInterpContext().isPotentialConstantExprUnevaluated(Info, E, FD); + return Diags.empty(); + } + // Fabricate a call stack frame to give the arguments a plausible cover story. CallStackFrame Frame(Info, SourceLocation(), FD, /*This=*/nullptr, /*CallExpr=*/nullptr, CallRef()); @@ -18106,6 +18184,10 @@ bool Expr::EvaluateCharRangeAsString(APValue &Result, bool Expr::tryEvaluateStrLen(uint64_t &Result, ASTContext &Ctx) const { Expr::EvalStatus Status; EvalInfo Info(Ctx, Status, EvalInfo::EM_ConstantFold); + + if (Info.EnableNewConstInterp) + return Info.Ctx.getInterpContext().evaluateStrlen(Info, this, Result); + return EvaluateBuiltinStrLen(this, Result, Info); } diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp index 5d3b56fc4e713..112b756d2be1a 100644 --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -320,6 +320,70 @@ bool clang::analyze_format_string::ParseUTF8InvalidSpecifier( // Methods on ArgType. //===----------------------------------------------------------------------===// +static bool namedTypeToLengthModifierKind(ASTContext &Ctx, QualType QT, + LengthModifier::Kind &K) { + if (!Ctx.getLangOpts().C99 && !Ctx.getLangOpts().CPlusPlus) + return false; + for (/**/; const auto *TT = QT->getAs(); QT = TT->desugar()) { + const auto *TD = TT->getDecl(); + const auto *DC = TT->getDecl()->getDeclContext(); + if (DC->isTranslationUnit() || DC->isStdNamespace()) { + StringRef Name = TD->getIdentifier()->getName(); + if (Name == "size_t") { + K = LengthModifier::AsSizeT; + return true; + } else if (Name == "ssize_t" /*Not C99, but common in Unix.*/) { + K = LengthModifier::AsSizeT; + return true; + } else if (Name == "ptrdiff_t") { + K = LengthModifier::AsPtrDiff; + return true; + } else if (Name == "intmax_t") { + K = LengthModifier::AsIntMax; + return true; + } else if (Name == "uintmax_t") { + K = LengthModifier::AsIntMax; + return true; + } + } + } + if (const auto *PST = QT->getAs()) { + using Kind = PredefinedSugarType::Kind; + switch (PST->getKind()) { + case Kind::SizeT: + case Kind::SignedSizeT: + K = LengthModifier::AsSizeT; + return true; + case Kind::PtrdiffT: + K = LengthModifier::AsPtrDiff; + return true; + } + llvm_unreachable("unexpected kind"); + } + return false; +} + +// Check whether T and E are compatible size_t/ptrdiff_t types. 
E must be +// consistent with LE. +// T is the type of the actual expression in the code to be checked, and E is +// the expected type parsed from the format string. +static clang::analyze_format_string::ArgType::MatchKind +matchesSizeTPtrdiffT(ASTContext &C, QualType T, QualType E) { + using MatchKind = clang::analyze_format_string::ArgType::MatchKind; + + if (!T->isIntegerType()) + return MatchKind::NoMatch; + + if (C.hasSameType(T, E)) + return MatchKind::Match; + + if (C.getCorrespondingSignedType(T.getCanonicalType()) != + C.getCorrespondingSignedType(E.getCanonicalType())) + return MatchKind::NoMatch; + + return MatchKind::NoMatchSignedness; +} + clang::analyze_format_string::ArgType::MatchKind ArgType::matchesType(ASTContext &C, QualType argTy) const { // When using the format attribute in C++, you can receive a function or an @@ -394,6 +458,10 @@ ArgType::matchesType(ASTContext &C, QualType argTy) const { } case SpecificTy: { + if (TK != TypeKind::DontCare) { + return matchesSizeTPtrdiffT(C, argTy, T); + } + if (const EnumType *ETy = argTy->getAs()) { // If the enum is incomplete we know nothing about the underlying type. // Assume that it's 'int'. Do not use the underlying type for a scoped @@ -653,6 +721,12 @@ ArgType::matchesArgType(ASTContext &C, const ArgType &Other) const { if (Left.K == AK::SpecificTy) { if (Right.K == AK::SpecificTy) { + if (Left.TK != TypeKind::DontCare) { + return matchesSizeTPtrdiffT(C, Right.T, Left.T); + } else if (Right.TK != TypeKind::DontCare) { + return matchesSizeTPtrdiffT(C, Left.T, Right.T); + } + auto Canon1 = C.getCanonicalType(Left.T); auto Canon2 = C.getCanonicalType(Right.T); if (Canon1 == Canon2) @@ -706,7 +780,11 @@ QualType ArgType::getRepresentativeType(ASTContext &C) const { Res = C.CharTy; break; case SpecificTy: - Res = T; + if (TK == TypeKind::PtrdiffT || TK == TypeKind::SizeT) + // Using Name as name, so no need to show the uglified name. + Res = T->getCanonicalTypeInternal(); + else + Res = T; break; case CStrTy: Res = C.getPointerType(C.CharTy); @@ -733,7 +811,6 @@ QualType ArgType::getRepresentativeType(ASTContext &C) const { std::string ArgType::getRepresentativeTypeName(ASTContext &C) const { std::string S = getRepresentativeType(C).getAsString(C.getPrintingPolicy()); - std::string Alias; if (Name) { // Use a specific name for this type, e.g. "size_t". @@ -1198,29 +1275,12 @@ FormatSpecifier::getCorrectedLengthModifier() const { return std::nullopt; } -bool FormatSpecifier::namedTypeToLengthModifier(QualType QT, +bool FormatSpecifier::namedTypeToLengthModifier(ASTContext &Ctx, QualType QT, LengthModifier &LM) { - for (/**/; const auto *TT = QT->getAs(); - QT = TT->getDecl()->getUnderlyingType()) { - const TypedefNameDecl *Typedef = TT->getDecl(); - const IdentifierInfo *Identifier = Typedef->getIdentifier(); - if (Identifier->getName() == "size_t") { - LM.setKind(LengthModifier::AsSizeT); - return true; - } else if (Identifier->getName() == "ssize_t") { - // Not C99, but common in Unix. 
- LM.setKind(LengthModifier::AsSizeT); - return true; - } else if (Identifier->getName() == "intmax_t") { - LM.setKind(LengthModifier::AsIntMax); - return true; - } else if (Identifier->getName() == "uintmax_t") { - LM.setKind(LengthModifier::AsIntMax); - return true; - } else if (Identifier->getName() == "ptrdiff_t") { - LM.setKind(LengthModifier::AsPtrDiff); - return true; - } + if (LengthModifier::Kind Out = LengthModifier::Kind::None; + namedTypeToLengthModifierKind(Ctx, QT, Out)) { + LM.setKind(Out); + return true; } return false; } diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 0520987ce6b3a..2a667934dba42 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -1384,14 +1384,6 @@ void CXXNameMangler::mangleUnresolvedPrefix(NestedNameSpecifier *qualifier, Out << "sr"; mangleSourceNameWithAbiTags(qualifier->getAsNamespace()); break; - case NestedNameSpecifier::NamespaceAlias: - if (qualifier->getPrefix()) - mangleUnresolvedPrefix(qualifier->getPrefix(), - /*recursive*/ true); - else - Out << "sr"; - mangleSourceNameWithAbiTags(qualifier->getAsNamespaceAlias()); - break; case NestedNameSpecifier::TypeSpec: { const Type *type = qualifier->getAsType(); @@ -2185,11 +2177,7 @@ void CXXNameMangler::manglePrefix(NestedNameSpecifier *qualifier) { llvm_unreachable("Can't mangle __super specifier"); case NestedNameSpecifier::Namespace: - mangleName(qualifier->getAsNamespace()); - return; - - case NestedNameSpecifier::NamespaceAlias: - mangleName(qualifier->getAsNamespaceAlias()->getNamespace()); + mangleName(qualifier->getAsNamespace()->getNamespace()); return; case NestedNameSpecifier::TypeSpec: @@ -2526,6 +2514,10 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty, mangleSourceNameWithAbiTags(cast(Ty)->getDecl()); break; + case Type::PredefinedSugar: + mangleType(cast(Ty)->desugar()); + break; + case Type::UnresolvedUsing: mangleSourceNameWithAbiTags( cast(Ty)->getDecl()); diff --git a/clang/lib/AST/NestedNameSpecifier.cpp b/clang/lib/AST/NestedNameSpecifier.cpp index db1ad89565189..56f74b92412d2 100644 --- a/clang/lib/AST/NestedNameSpecifier.cpp +++ b/clang/lib/AST/NestedNameSpecifier.cpp @@ -66,10 +66,9 @@ NestedNameSpecifier *NestedNameSpecifier::Create(const ASTContext &Context, return FindOrInsert(Context, Mockup); } -NestedNameSpecifier * -NestedNameSpecifier::Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - const NamespaceDecl *NS) { +NestedNameSpecifier *NestedNameSpecifier::Create(const ASTContext &Context, + NestedNameSpecifier *Prefix, + const NamespaceBaseDecl *NS) { assert(NS && "Namespace cannot be NULL"); assert((!Prefix || (Prefix->getAsType() == nullptr && @@ -78,23 +77,7 @@ NestedNameSpecifier::Create(const ASTContext &Context, NestedNameSpecifier Mockup; Mockup.Prefix.setPointer(Prefix); Mockup.Prefix.setInt(StoredDecl); - Mockup.Specifier = const_cast(NS); - return FindOrInsert(Context, Mockup); -} - -NestedNameSpecifier * -NestedNameSpecifier::Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - const NamespaceAliasDecl *Alias) { - assert(Alias && "Namespace alias cannot be NULL"); - assert((!Prefix || - (Prefix->getAsType() == nullptr && - Prefix->getAsIdentifier() == nullptr)) && - "Broken nested name specifier"); - NestedNameSpecifier Mockup; - Mockup.Prefix.setPointer(Prefix); - Mockup.Prefix.setInt(StoredDecl); - Mockup.Specifier = const_cast(Alias); + Mockup.Specifier = const_cast(NS); return FindOrInsert(Context, Mockup); } @@ -147,9 +130,7 @@ 
NestedNameSpecifier::SpecifierKind NestedNameSpecifier::getKind() const { case StoredDecl: { NamedDecl *ND = static_cast(Specifier); - if (isa(ND)) - return Super; - return isa(ND) ? Namespace : NamespaceAlias; + return isa(ND) ? Super : Namespace; } case StoredTypeSpec: @@ -159,18 +140,11 @@ NestedNameSpecifier::SpecifierKind NestedNameSpecifier::getKind() const { llvm_unreachable("Invalid NNS Kind!"); } -/// Retrieve the namespace stored in this nested name specifier. -NamespaceDecl *NestedNameSpecifier::getAsNamespace() const { +/// Retrieve the namespace or namespace alias stored in this nested name +/// specifier. +NamespaceBaseDecl *NestedNameSpecifier::getAsNamespace() const { if (Prefix.getInt() == StoredDecl) - return dyn_cast(static_cast(Specifier)); - - return nullptr; -} - -/// Retrieve the namespace alias stored in this nested name specifier. -NamespaceAliasDecl *NestedNameSpecifier::getAsNamespaceAlias() const { - if (Prefix.getInt() == StoredDecl) - return dyn_cast(static_cast(Specifier)); + return dyn_cast(static_cast(Specifier)); return nullptr; } @@ -204,7 +178,6 @@ NestedNameSpecifierDependence NestedNameSpecifier::getDependence() const { } case Namespace: - case NamespaceAlias: case Global: return NestedNameSpecifierDependence::None; @@ -284,7 +257,6 @@ NestedNameSpecifier::translateToType(const ASTContext &Context) const { } case SpecifierKind::Global: case SpecifierKind::Namespace: - case SpecifierKind::NamespaceAlias: case SpecifierKind::Super: // These are not representable as types. return nullptr; @@ -305,16 +277,16 @@ void NestedNameSpecifier::print(raw_ostream &OS, const PrintingPolicy &Policy, OS << getAsIdentifier()->getName(); break; - case Namespace: - if (getAsNamespace()->isAnonymousNamespace()) - return; - - OS << getAsNamespace()->getName(); - break; - - case NamespaceAlias: - OS << getAsNamespaceAlias()->getName(); + case Namespace: { + NamespaceBaseDecl *Namespace = getAsNamespace(); + if (const auto *NS = dyn_cast(Namespace)) { + assert(!NS->isAnonymousNamespace()); + OS << NS->getName(); + } else { + OS << cast(Namespace)->getName(); + } break; + } case Global: OS << "::"; @@ -367,7 +339,6 @@ NestedNameSpecifierLoc::getLocalDataLength(NestedNameSpecifier *Qualifier) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Super: // The location of the identifier or namespace name. 
Length += sizeof(SourceLocation::UIntTy); @@ -418,7 +389,6 @@ SourceRange NestedNameSpecifierLoc::getLocalSourceRange() const { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Super: return SourceRange( LoadSourceLocation(Data, Offset), @@ -569,7 +539,7 @@ void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, } void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, - NamespaceDecl *Namespace, + NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc) { Representation = NestedNameSpecifier::Create(Context, Representation, @@ -580,17 +550,6 @@ void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, SaveSourceLocation(ColonColonLoc, Buffer, BufferSize, BufferCapacity); } -void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, - NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, - SourceLocation ColonColonLoc) { - Representation = NestedNameSpecifier::Create(Context, Representation, Alias); - - // Push source-location info into the buffer. - SaveSourceLocation(AliasLoc, Buffer, BufferSize, BufferCapacity); - SaveSourceLocation(ColonColonLoc, Buffer, BufferSize, BufferCapacity); -} - void NestedNameSpecifierLocBuilder::MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc) { assert(!Representation && "Already have a nested-name-specifier!?"); @@ -627,7 +586,6 @@ void NestedNameSpecifierLocBuilder::MakeTrivial(ASTContext &Context, switch (NNS->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: SaveSourceLocation(R.getBegin(), Buffer, BufferSize, BufferCapacity); break; diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp index 7fdfcfa3014f3..bd87d4418484b 100644 --- a/clang/lib/AST/ODRHash.cpp +++ b/clang/lib/AST/ODRHash.cpp @@ -127,9 +127,6 @@ void ODRHash::AddNestedNameSpecifier(const NestedNameSpecifier *NNS) { case NestedNameSpecifier::Namespace: AddDecl(NNS->getAsNamespace()); break; - case NestedNameSpecifier::NamespaceAlias: - AddDecl(NNS->getAsNamespaceAlias()); - break; case NestedNameSpecifier::TypeSpec: AddType(NNS->getAsType()); break; diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index 293164ddac8f8..bcd44f0a85eed 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -543,7 +543,8 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType(Ctx.getIntMaxType(), "intmax_t"); case LengthModifier::AsSizeT: - return ArgType::makeSizeT(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t")); case LengthModifier::AsInt3264: return Ctx.getTargetInfo().getTriple().isArch64Bit() ? 
ArgType(Ctx.LongLongTy, "__int64") @@ -626,9 +627,11 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::PtrTo(ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t"))); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); case LengthModifier::AsLongDouble: return ArgType(); // FIXME: Is this a known extension? case LengthModifier::AsAllocate: @@ -917,7 +920,7 @@ bool PrintfSpecifier::fixType(QualType QT, const LangOptions &LangOpt, // Handle size_t, ptrdiff_t, etc. that have dedicated length modifiers in C99. if (LangOpt.C99 || LangOpt.CPlusPlus11) - namedTypeToLengthModifier(QT, LM); + namedTypeToLengthModifier(Ctx, QT, LM); // If fixing the length modifier was enough, we might be done. if (hasValidLengthModifier(Ctx.getTargetInfo(), LangOpt)) { diff --git a/clang/lib/AST/QualTypeNames.cpp b/clang/lib/AST/QualTypeNames.cpp index 39703d6d7b882..b43bcd8d1f1c1 100644 --- a/clang/lib/AST/QualTypeNames.cpp +++ b/clang/lib/AST/QualTypeNames.cpp @@ -218,16 +218,7 @@ static NestedNameSpecifier *getFullyQualifiedNestedNameSpecifier( return Scope; case NestedNameSpecifier::Namespace: return TypeName::createNestedNameSpecifier( - Ctx, Scope->getAsNamespace(), WithGlobalNsPrefix); - case NestedNameSpecifier::NamespaceAlias: - // Namespace aliases are only valid for the duration of the - // scope where they were introduced, and therefore are often - // invalid at the end of the TU. So use the namespace name more - // likely to be valid at the end of the TU. - return TypeName::createNestedNameSpecifier( - Ctx, - Scope->getAsNamespaceAlias()->getNamespace()->getCanonicalDecl(), - WithGlobalNsPrefix); + Ctx, Scope->getAsNamespace()->getNamespace(), WithGlobalNsPrefix); case NestedNameSpecifier::Identifier: // A function or some other construct that makes it un-namable // at the end of the TU. Skip the current component of the name, diff --git a/clang/lib/AST/ScanfFormatString.cpp b/clang/lib/AST/ScanfFormatString.cpp index 7ee21c8c61954..1227edd47d13d 100644 --- a/clang/lib/AST/ScanfFormatString.cpp +++ b/clang/lib/AST/ScanfFormatString.cpp @@ -251,9 +251,11 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::PtrTo(ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t"))); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); case LengthModifier::AsLongDouble: // GNU extension. 
return ArgType::PtrTo(Ctx.LongLongTy); @@ -292,10 +294,11 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getUIntMaxType(), "uintmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSizeType(), "size_t")); - case LengthModifier::AsPtrDiff: return ArgType::PtrTo( - ArgType(Ctx.getUnsignedPointerDiffType(), "unsigned ptrdiff_t")); + ArgType::makeSizeT(ArgType(Ctx.getSizeType(), "size_t"))); + case LengthModifier::AsPtrDiff: + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getUnsignedPointerDiffType(), "unsigned ptrdiff_t"))); case LengthModifier::AsLongDouble: // GNU extension. return ArgType::PtrTo(Ctx.UnsignedLongLongTy); @@ -390,9 +393,11 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::PtrTo(ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t"))); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); case LengthModifier::AsLongDouble: return ArgType(); // FIXME: Is this a known extension? case LengthModifier::AsAllocate: @@ -501,7 +506,7 @@ bool ScanfSpecifier::fixType(QualType QT, QualType RawQT, // Handle size_t, ptrdiff_t, etc. that have dedicated length modifiers in C99. if (LangOpt.C99 || LangOpt.CPlusPlus11) - namedTypeToLengthModifier(PT, LM); + namedTypeToLengthModifier(Ctx, PT, LM); // If fixing the length modifier was enough, we are done. if (hasValidLengthModifier(Ctx.getTargetInfo(), LangOpt)) { diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index be02bdde38a3d..6ba5ec89964a9 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -610,7 +610,7 @@ void StmtPrinter::VisitObjCAtTryStmt(ObjCAtTryStmt *Node) { } } - if (auto *FS = static_cast(Node->getFinallyStmt())) { + if (ObjCAtFinallyStmt *FS = Node->getFinallyStmt()) { Indent() << "@finally"; if (auto *CS = dyn_cast(FS->getFinallyBody())) { PrintRawCompoundStmt(CS); diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 9d7c2757d6ee4..6b524cfcd2d71 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -843,7 +843,10 @@ void TextNodeDumper::Visit(const APValue &Value, QualType Ty) { } ColorScope Color(OS, ShowColors, DeclNameColor); - OS << Value.getMemberPointerDecl()->getDeclName(); + if (const ValueDecl *MemDecl = Value.getMemberPointerDecl()) + OS << MemDecl->getDeclName(); + else + OS << "null"; return; } case APValue::AddrLabelDiff: @@ -1050,10 +1053,6 @@ void clang::TextNodeDumper::dumpNestedNameSpecifier(const NestedNameSpecifier *N OS << " "; // "Namespace" is printed as the decl kind. dumpBareDeclRef(NNS->getAsNamespace()); break; - case NestedNameSpecifier::NamespaceAlias: - OS << " "; // "NamespaceAlias" is printed as the decl kind. 
- dumpBareDeclRef(NNS->getAsNamespaceAlias()); - break; case NestedNameSpecifier::TypeSpec: OS << " TypeSpec"; dumpType(QualType(NNS->getAsType(), 0)); diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index e5a1ab2ff8906..7444a2f90c5dd 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -5613,3 +5613,15 @@ HLSLAttributedResourceType::findHandleTypeOnResource(const Type *RT) { } return nullptr; } + +StringRef PredefinedSugarType::getName(Kind KD) { + switch (KD) { + case Kind::SizeT: + return "__size_t"; + case Kind::SignedSizeT: + return "__signed_size_t"; + case Kind::PtrdiffT: + return "__ptrdiff_t"; + } + llvm_unreachable("unexpected kind"); +} diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 818d2139628e3..deb453fe6ee75 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -248,6 +248,7 @@ bool TypePrinter::canPrefixQualifiers(const Type *T, case Type::BTFTagAttributed: case Type::HLSLAttributedResource: case Type::HLSLInlineSpirv: + case Type::PredefinedSugar: CanPrefixQualifiers = true; break; @@ -1417,6 +1418,15 @@ void TypePrinter::printDependentBitIntBefore(const DependentBitIntType *T, void TypePrinter::printDependentBitIntAfter(const DependentBitIntType *T, raw_ostream &OS) {} +void TypePrinter::printPredefinedSugarBefore(const PredefinedSugarType *T, + raw_ostream &OS) { + OS << T->getIdentifier()->getName(); + spaceBeforePlaceHolder(OS); +} + +void TypePrinter::printPredefinedSugarAfter(const PredefinedSugarType *T, + raw_ostream &OS) {} + /// Appends the given scope to the end of a string. void TypePrinter::AppendScope(DeclContext *DC, raw_ostream &OS, DeclarationName NameInScope) { diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index bf67bea6c9933..ae6ec9f76cbf6 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -18,13 +18,20 @@ #include "llvm/ADT/ImmutableMap.h" #include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TimeProfiler.h" #include -namespace clang { +namespace clang::lifetimes { +namespace internal { namespace { +template +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, ID ID) { + return OS << ID.Value; +} +} // namespace /// Represents the storage location being borrowed, e.g., a specific stack /// variable. @@ -35,32 +42,6 @@ struct AccessPath { AccessPath(const clang::ValueDecl *D) : D(D) {} }; -/// A generic, type-safe wrapper for an ID, distinguished by its `Tag` type. -/// Used for giving ID to loans and origins. -template struct ID { - uint32_t Value = 0; - - bool operator==(const ID &Other) const { return Value == Other.Value; } - bool operator!=(const ID &Other) const { return !(*this == Other); } - bool operator<(const ID &Other) const { return Value < Other.Value; } - ID operator++(int) { - ID Tmp = *this; - ++Value; - return Tmp; - } - void Profile(llvm::FoldingSetNodeID &IDBuilder) const { - IDBuilder.AddInteger(Value); - } -}; - -template -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, ID ID) { - return OS << ID.Value; -} - -using LoanID = ID; -using OriginID = ID; - /// Information about a single borrow, or "Loan". A loan is created when a /// reference or pointer is created. struct Loan { @@ -222,7 +203,9 @@ class Fact { /// An origin is propagated from a source to a destination (e.g., p = q). 
AssignOrigin, /// An origin escapes the function by flowing into the return value. - ReturnOfOrigin + ReturnOfOrigin, + /// A marker for a specific point in the code, for testing. + TestPoint, }; private: @@ -309,6 +292,24 @@ class ReturnOfOriginFact : public Fact { } }; +/// A dummy-fact used to mark a specific point in the code for testing. +/// It is generated by recognizing a `void("__lifetime_test_point_...")` cast. +class TestPointFact : public Fact { + StringRef Annotation; + +public: + static bool classof(const Fact *F) { return F->getKind() == Kind::TestPoint; } + + explicit TestPointFact(StringRef Annotation) + : Fact(Kind::TestPoint), Annotation(Annotation) {} + + StringRef getAnnotation() const { return Annotation; } + + void dump(llvm::raw_ostream &OS) const override { + OS << "TestPoint (Annotation: \"" << getAnnotation() << "\")\n"; + } +}; + class FactManager { public: llvm::ArrayRef getFacts(const CFGBlock *B) const { @@ -362,6 +363,7 @@ class FactManager { }; class FactGenerator : public ConstStmtVisitor { + using Base = ConstStmtVisitor; public: FactGenerator(FactManager &FactMgr, AnalysisDeclContext &AC) @@ -457,6 +459,15 @@ class FactGenerator : public ConstStmtVisitor { } } + void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *FCE) { + // Check if this is a test point marker. If so, we are done with this + // expression. + if (VisitTestPoint(FCE)) + return; + // Visit as normal otherwise. + Base::VisitCXXFunctionalCastExpr(FCE); + } + private: // Check if a type has an origin. bool hasOrigin(QualType QT) { return QT->isPointerOrReferenceType(); } @@ -490,101 +501,267 @@ class FactGenerator : public ConstStmtVisitor { } } + /// Checks if the expression is a `void("__lifetime_test_point_...")` cast. + /// If so, creates a `TestPointFact` and returns true. + bool VisitTestPoint(const CXXFunctionalCastExpr *FCE) { + if (!FCE->getType()->isVoidType()) + return false; + + const auto *SubExpr = FCE->getSubExpr()->IgnoreParenImpCasts(); + if (const auto *SL = dyn_cast(SubExpr)) { + llvm::StringRef LiteralValue = SL->getString(); + const std::string Prefix = "__lifetime_test_point_"; + + if (LiteralValue.starts_with(Prefix)) { + StringRef Annotation = LiteralValue.drop_front(Prefix.length()); + CurrentBlockFacts.push_back( + FactMgr.createFact(Annotation)); + return true; + } + } + return false; + } + FactManager &FactMgr; AnalysisDeclContext &AC; llvm::SmallVector CurrentBlockFacts; }; // ========================================================================= // -// The Dataflow Lattice +// Generic Dataflow Analysis +// ========================================================================= // + +enum class Direction { Forward, Backward }; + +/// A `ProgramPoint` identifies a location in the CFG by pointing to a specific +/// `Fact`. identified by a lifetime-related event (`Fact`). +/// +/// A `ProgramPoint` has "after" semantics: it represents the location +/// immediately after its corresponding `Fact`. +using ProgramPoint = const Fact *; + +/// A generic, policy-based driver for dataflow analyses. It combines +/// the dataflow runner and the transferer logic into a single class hierarchy. +/// +/// The derived class is expected to provide: +/// - A `Lattice` type. +/// - `StringRef getAnalysisName() const` +/// - `Lattice getInitialState();` The initial state of the analysis. +/// - `Lattice join(Lattice, Lattice);` Merges states from multiple CFG paths. 
+/// - `Lattice transfer(Lattice, const FactType&);` Defines how a single +/// lifetime-relevant `Fact` transforms the lattice state. Only overloads +/// for facts relevant to the analysis need to be implemented. +/// +/// \tparam Derived The CRTP derived class that implements the specific +/// analysis. +/// \tparam LatticeType The dataflow lattice used by the analysis. +/// \tparam Dir The direction of the analysis (Forward or Backward). +/// TODO: Maybe use the dataflow framework! The framework might need changes +/// to support the current comparison done at block-entry. +template +class DataflowAnalysis { +public: + using Lattice = LatticeType; + using Base = DataflowAnalysis; + +private: + const CFG &Cfg; + AnalysisDeclContext &AC; + + /// The dataflow state before a basic block is processed. + llvm::DenseMap InStates; + /// The dataflow state after a basic block is processed. + llvm::DenseMap OutStates; + /// The dataflow state at a Program Point. + /// In a forward analysis, this is the state after the Fact at that point has + /// been applied, while in a backward analysis, it is the state before. + llvm::DenseMap PerPointStates; + + static constexpr bool isForward() { return Dir == Direction::Forward; } + +protected: + FactManager &AllFacts; + + explicit DataflowAnalysis(const CFG &C, AnalysisDeclContext &AC, + FactManager &F) + : Cfg(C), AC(AC), AllFacts(F) {} + +public: + void run() { + Derived &D = static_cast(*this); + llvm::TimeTraceScope Time(D.getAnalysisName()); + + using Worklist = + std::conditional_t; + Worklist W(Cfg, AC); + + const CFGBlock *Start = isForward() ? &Cfg.getEntry() : &Cfg.getExit(); + InStates[Start] = D.getInitialState(); + W.enqueueBlock(Start); + + llvm::SmallBitVector Visited(Cfg.getNumBlockIDs() + 1); + + while (const CFGBlock *B = W.dequeue()) { + Lattice StateIn = getInState(B); + Lattice StateOut = transferBlock(B, StateIn); + OutStates[B] = StateOut; + Visited.set(B->getBlockID()); + for (const CFGBlock *AdjacentB : isForward() ? B->succs() : B->preds()) { + Lattice OldInState = getInState(AdjacentB); + Lattice NewInState = D.join(OldInState, StateOut); + // Enqueue the adjacent block if its in-state has changed or if we have + // never visited it. + if (!Visited.test(AdjacentB->getBlockID()) || + NewInState != OldInState) { + InStates[AdjacentB] = NewInState; + W.enqueueBlock(AdjacentB); + } + } + } + } + + Lattice getState(ProgramPoint P) const { return PerPointStates.lookup(P); } + + Lattice getInState(const CFGBlock *B) const { return InStates.lookup(B); } + + Lattice getOutState(const CFGBlock *B) const { return OutStates.lookup(B); } + + void dump() const { + const Derived *D = static_cast(this); + llvm::dbgs() << "==========================================\n"; + llvm::dbgs() << D->getAnalysisName() << " results:\n"; + llvm::dbgs() << "==========================================\n"; + const CFGBlock &B = isForward() ? Cfg.getExit() : Cfg.getEntry(); + getOutState(&B).dump(llvm::dbgs()); + } + +private: + /// Computes the state at one end of a block by applying all its facts + /// sequentially to a given state from the other end. + Lattice transferBlock(const CFGBlock *Block, Lattice State) { + auto Facts = AllFacts.getFacts(Block); + if constexpr (isForward()) { + for (const Fact *F : Facts) { + State = transferFact(State, F); + PerPointStates[F] = State; + } + } else { + for (const Fact *F : llvm::reverse(Facts)) { + // In backward analysis, capture the state before applying the fact. 
+ PerPointStates[F] = State; + State = transferFact(State, F); + } + } + return State; + } + + Lattice transferFact(Lattice In, const Fact *F) { + assert(F); + Derived *D = static_cast(this); + switch (F->getKind()) { + case Fact::Kind::Issue: + return D->transfer(In, *F->getAs()); + case Fact::Kind::Expire: + return D->transfer(In, *F->getAs()); + case Fact::Kind::AssignOrigin: + return D->transfer(In, *F->getAs()); + case Fact::Kind::ReturnOfOrigin: + return D->transfer(In, *F->getAs()); + case Fact::Kind::TestPoint: + return D->transfer(In, *F->getAs()); + } + llvm_unreachable("Unknown fact kind"); + } + +public: + Lattice transfer(Lattice In, const IssueFact &) { return In; } + Lattice transfer(Lattice In, const ExpireFact &) { return In; } + Lattice transfer(Lattice In, const AssignOriginFact &) { return In; } + Lattice transfer(Lattice In, const ReturnOfOriginFact &) { return In; } + Lattice transfer(Lattice In, const TestPointFact &) { return In; } +}; + +namespace utils { + +/// Computes the union of two ImmutableSets. +template +static llvm::ImmutableSet join(llvm::ImmutableSet A, + llvm::ImmutableSet B, + typename llvm::ImmutableSet::Factory &F) { + if (A.getHeight() < B.getHeight()) + std::swap(A, B); + for (const T &E : B) + A = F.add(A, E); + return A; +} + +/// Computes the key-wise union of two ImmutableMaps. +// TODO(opt): This key-wise join is a performance bottleneck. A more +// efficient merge could be implemented using a Patricia Trie or HAMT +// instead of the current AVL-tree-based ImmutableMap. +template +static llvm::ImmutableMap +join(llvm::ImmutableMap A, llvm::ImmutableMap B, + typename llvm::ImmutableMap::Factory &F, Joiner joinValues) { + if (A.getHeight() < B.getHeight()) + std::swap(A, B); + + // For each element in B, join it with the corresponding element in A + // (or with an empty value if it doesn't exist in A). + for (const auto &Entry : B) { + const K &Key = Entry.first; + const V &ValB = Entry.second; + if (const V *ValA = A.lookup(Key)) + A = F.add(A, Key, joinValues(*ValA, ValB)); + else + A = F.add(A, Key, ValB); + } + return A; +} +} // namespace utils + +// ========================================================================= // +// Loan Propagation Analysis // ========================================================================= // -// Using LLVM's immutable collections is efficient for dataflow analysis -// as it avoids deep copies during state transitions. -// TODO(opt): Consider using a bitset to represent the set of loans. -using LoanSet = llvm::ImmutableSet; using OriginLoanMap = llvm::ImmutableMap; /// An object to hold the factories for immutable collections, ensuring /// that all created states share the same underlying memory management. struct LifetimeFactory { OriginLoanMap::Factory OriginMapFactory; - LoanSet::Factory LoanSetFact; + LoanSet::Factory LoanSetFactory; /// Creates a singleton set containing only the given loan ID. LoanSet createLoanSet(LoanID LID) { - return LoanSetFact.add(LoanSetFact.getEmptySet(), LID); + return LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID); } }; -/// LifetimeLattice represents the state of our analysis at a given program -/// point. It is an immutable object, and all operations produce a new -/// instance rather than modifying the existing one. -struct LifetimeLattice { +/// Represents the dataflow lattice for loan propagation. 
+/// +/// This lattice tracks which loans each origin may hold at a given program +/// point.The lattice has a finite height: An origin's loan set is bounded by +/// the total number of loans in the function. +/// TODO(opt): To reduce the lattice size, propagate origins of declarations, +/// not expressions, because expressions are not visible across blocks. +struct LoanPropagationLattice { /// The map from an origin to the set of loans it contains. - /// The lattice has a finite height: An origin's loan set is bounded by the - /// total number of loans in the function. - /// TODO(opt): To reduce the lattice size, propagate origins of declarations, - /// not expressions, because expressions are not visible across blocks. OriginLoanMap Origins = OriginLoanMap(nullptr); - explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {} - LifetimeLattice() = default; + explicit LoanPropagationLattice(const OriginLoanMap &S) : Origins(S) {} + LoanPropagationLattice() = default; - bool operator==(const LifetimeLattice &Other) const { + bool operator==(const LoanPropagationLattice &Other) const { return Origins == Other.Origins; } - bool operator!=(const LifetimeLattice &Other) const { + bool operator!=(const LoanPropagationLattice &Other) const { return !(*this == Other); } - LoanSet getLoans(OriginID OID) const { - if (auto *Loans = Origins.lookup(OID)) - return *Loans; - return LoanSet(nullptr); - } - - /// Computes the union of two lattices by performing a key-wise join of - /// their OriginLoanMaps. - // TODO(opt): This key-wise join is a performance bottleneck. A more - // efficient merge could be implemented using a Patricia Trie or HAMT - // instead of the current AVL-tree-based ImmutableMap. - // TODO(opt): Keep the state small by removing origins which become dead. - LifetimeLattice join(const LifetimeLattice &Other, - LifetimeFactory &Factory) const { - /// Merge the smaller map into the larger one ensuring we iterate over the - /// smaller map. - if (Origins.getHeight() < Other.Origins.getHeight()) - return Other.join(*this, Factory); - - OriginLoanMap JoinedState = Origins; - // For each origin in the other map, union its loan set with ours. - for (const auto &Entry : Other.Origins) { - OriginID OID = Entry.first; - LoanSet OtherLoanSet = Entry.second; - JoinedState = Factory.OriginMapFactory.add( - JoinedState, OID, join(getLoans(OID), OtherLoanSet, Factory)); - } - return LifetimeLattice(JoinedState); - } - - LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const { - /// Merge the smaller set into the larger one ensuring we iterate over the - /// smaller set. - if (a.getHeight() < b.getHeight()) - std::swap(a, b); - LoanSet Result = a; - for (LoanID LID : b) { - /// TODO(opt): Profiling shows that this loop is a major performance - /// bottleneck. Investigate using a BitVector to represent the set of - /// loans for improved join performance. - Result = Factory.LoanSetFact.add(Result, LID); - } - return Result; - } - void dump(llvm::raw_ostream &OS) const { - OS << "LifetimeLattice State:\n"; + OS << "LoanPropagationLattice State:\n"; if (Origins.isEmpty()) OS << " \n"; for (const auto &Entry : Origins) { @@ -596,155 +773,93 @@ struct LifetimeLattice { } }; -// ========================================================================= // -// The Transfer Function -// ========================================================================= // -class Transferer { - FactManager &AllFacts; +/// The analysis that tracks which loans belong to which origins. 
+class LoanPropagationAnalysis + : public DataflowAnalysis { + LifetimeFactory &Factory; public: - explicit Transferer(FactManager &F, LifetimeFactory &Factory) - : AllFacts(F), Factory(Factory) {} - - /// Computes the exit state of a block by applying all its facts sequentially - /// to a given entry state. - /// TODO: We might need to store intermediate states per-fact in the block for - /// later analysis. - LifetimeLattice transferBlock(const CFGBlock *Block, - LifetimeLattice EntryState) { - LifetimeLattice BlockState = EntryState; - llvm::ArrayRef Facts = AllFacts.getFacts(Block); - - for (const Fact *F : Facts) { - BlockState = transferFact(BlockState, F); - } - return BlockState; - } + LoanPropagationAnalysis(const CFG &C, AnalysisDeclContext &AC, FactManager &F, + LifetimeFactory &Factory) + : DataflowAnalysis(C, AC, F), Factory(Factory) {} -private: - LifetimeLattice transferFact(LifetimeLattice In, const Fact *F) { - switch (F->getKind()) { - case Fact::Kind::Issue: - return transfer(In, *F->getAs()); - case Fact::Kind::AssignOrigin: - return transfer(In, *F->getAs()); - // Expire and ReturnOfOrigin facts don't modify the Origins and the State. - case Fact::Kind::Expire: - case Fact::Kind::ReturnOfOrigin: - return In; - } - llvm_unreachable("Unknown fact kind"); + using Base::transfer; + + StringRef getAnalysisName() const { return "LoanPropagation"; } + + Lattice getInitialState() { return Lattice{}; } + + /// Merges two lattices by taking the union of loans for each origin. + // TODO(opt): Keep the state small by removing origins which become dead. + Lattice join(Lattice A, Lattice B) { + OriginLoanMap JoinedOrigins = + utils::join(A.Origins, B.Origins, Factory.OriginMapFactory, + [this](LoanSet S1, LoanSet S2) { + return utils::join(S1, S2, Factory.LoanSetFactory); + }); + return Lattice(JoinedOrigins); } /// A new loan is issued to the origin. Old loans are erased. - LifetimeLattice transfer(LifetimeLattice In, const IssueFact &F) { + Lattice transfer(Lattice In, const IssueFact &F) { OriginID OID = F.getOriginID(); LoanID LID = F.getLoanID(); - return LifetimeLattice(Factory.OriginMapFactory.add( + return LoanPropagationLattice(Factory.OriginMapFactory.add( In.Origins, OID, Factory.createLoanSet(LID))); } /// The destination origin's loan set is replaced by the source's. /// This implicitly "resets" the old loans of the destination. - LifetimeLattice transfer(LifetimeLattice InState, const AssignOriginFact &F) { + Lattice transfer(Lattice In, const AssignOriginFact &F) { OriginID DestOID = F.getDestOriginID(); OriginID SrcOID = F.getSrcOriginID(); - LoanSet SrcLoans = InState.getLoans(SrcOID); - return LifetimeLattice( - Factory.OriginMapFactory.add(InState.Origins, DestOID, SrcLoans)); + LoanSet SrcLoans = getLoans(In, SrcOID); + return LoanPropagationLattice( + Factory.OriginMapFactory.add(In.Origins, DestOID, SrcLoans)); } -}; -// ========================================================================= // -// Dataflow analysis -// ========================================================================= // - -/// Drives the intra-procedural dataflow analysis. -/// -/// Orchestrates the analysis by iterating over the CFG using a worklist -/// algorithm. It computes a fixed point by propagating the LifetimeLattice -/// state through each block until the state no longer changes. -/// TODO: Maybe use the dataflow framework! The framework might need changes -/// to support the current comparison done at block-entry. 
-class LifetimeDataflow { - const CFG &Cfg; - AnalysisDeclContext &AC; - LifetimeFactory LifetimeFact; - - Transferer Xfer; - - /// Stores the merged analysis state at the entry of each CFG block. - llvm::DenseMap BlockEntryStates; - /// Stores the analysis state at the exit of each CFG block, after the - /// transfer function has been applied. - llvm::DenseMap BlockExitStates; - -public: - LifetimeDataflow(const CFG &C, FactManager &FS, AnalysisDeclContext &AC) - : Cfg(C), AC(AC), Xfer(FS, LifetimeFact) {} - - void run() { - llvm::TimeTraceScope TimeProfile("Lifetime Dataflow"); - ForwardDataflowWorklist Worklist(Cfg, AC); - const CFGBlock *Entry = &Cfg.getEntry(); - BlockEntryStates[Entry] = LifetimeLattice{}; - Worklist.enqueueBlock(Entry); - while (const CFGBlock *B = Worklist.dequeue()) { - LifetimeLattice EntryState = getEntryState(B); - LifetimeLattice ExitState = Xfer.transferBlock(B, EntryState); - BlockExitStates[B] = ExitState; - - for (const CFGBlock *Successor : B->succs()) { - auto SuccIt = BlockEntryStates.find(Successor); - LifetimeLattice OldSuccEntryState = (SuccIt != BlockEntryStates.end()) - ? SuccIt->second - : LifetimeLattice{}; - LifetimeLattice NewSuccEntryState = - OldSuccEntryState.join(ExitState, LifetimeFact); - // Enqueue the successor if its entry state has changed. - // TODO(opt): Consider changing 'join' to report a change if != - // comparison is found expensive. - if (SuccIt == BlockEntryStates.end() || - NewSuccEntryState != OldSuccEntryState) { - BlockEntryStates[Successor] = NewSuccEntryState; - Worklist.enqueueBlock(Successor); - } - } - } - } - - void dump() const { - llvm::dbgs() << "==========================================\n"; - llvm::dbgs() << " Dataflow results:\n"; - llvm::dbgs() << "==========================================\n"; - const CFGBlock &B = Cfg.getExit(); - getExitState(&B).dump(llvm::dbgs()); - } - - LifetimeLattice getEntryState(const CFGBlock *B) const { - return BlockEntryStates.lookup(B); + LoanSet getLoans(OriginID OID, ProgramPoint P) { + return getLoans(getState(P), OID); } - LifetimeLattice getExitState(const CFGBlock *B) const { - return BlockExitStates.lookup(B); +private: + LoanSet getLoans(Lattice L, OriginID OID) { + if (auto *Loans = L.Origins.lookup(OID)) + return *Loans; + return Factory.LoanSetFactory.getEmptySet(); } }; // ========================================================================= // -// TODO: Analysing dataflow results and error reporting. +// TODO: +// - Modify loan expiry analysis to answer `bool isExpired(Loan L, Point P)` +// - Modify origin liveness analysis to answer `bool isLive(Origin O, Point P)` +// - Using the above three to perform the final error reporting. // ========================================================================= // -} // anonymous namespace -void runLifetimeSafetyAnalysis(const DeclContext &DC, const CFG &Cfg, - AnalysisDeclContext &AC) { +// ========================================================================= // +// LifetimeSafetyAnalysis Class Implementation +// ========================================================================= // + +// We need this here for unique_ptr with forward declared class. 
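// A quick illustration of why (names below are hypothetical, not part of the
// patch): when a class holds std::unique_ptr<T> to a type that is only
// forward-declared in its header, the destructor must be defined in a
// translation unit where T is complete, because the unique_ptr deleter needs
// the complete type.
//
//   // widget.h
//   class Impl;                          // forward declaration only
//   class Widget {
//     std::unique_ptr<Impl> P;
//   public:
//     Widget();
//     ~Widget();                         // declared here, not defaulted inline
//   };
//
//   // widget.cpp
//   #include "impl.h"                    // Impl is complete here
//   Widget::~Widget() = default;         // deleter instantiates correctly
//
// The out-of-line defaulted destructor below plays the same role for the
// forward-declared Factory, FactMgr and LoanPropagation members.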
+LifetimeSafetyAnalysis::~LifetimeSafetyAnalysis() = default; + +LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC) + : AC(AC), Factory(std::make_unique<LifetimeFactory>()), + FactMgr(std::make_unique<FactManager>()) {} + +void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); + + const CFG &Cfg = *AC.getCFG(); DEBUG_WITH_TYPE("PrintCFG", Cfg.dump(AC.getASTContext().getLangOpts(), /*ShowColors=*/true)); - FactManager FactMgr; - FactGenerator FactGen(FactMgr, AC); + + FactGenerator FactGen(*FactMgr, AC); FactGen.run(); - DEBUG_WITH_TYPE("LifetimeFacts", FactMgr.dump(Cfg, AC)); + DEBUG_WITH_TYPE("LifetimeFacts", FactMgr->dump(Cfg, AC)); /// TODO(opt): Consider optimizing individual blocks before running the /// dataflow analysis. @@ -755,8 +870,56 @@ void runLifetimeSafetyAnalysis(const DeclContext &DC, const CFG &Cfg, /// blocks; only Decls are visible. Therefore, loans in a block that /// never reach an Origin associated with a Decl can be safely dropped by /// the analysis. - LifetimeDataflow Dataflow(Cfg, FactMgr, AC); - Dataflow.run(); - DEBUG_WITH_TYPE("LifetimeDataflow", Dataflow.dump()); + LoanPropagation = + std::make_unique<LoanPropagationAnalysis>(Cfg, AC, *FactMgr, *Factory); + LoanPropagation->run(); +} + +LoanSet LifetimeSafetyAnalysis::getLoansAtPoint(OriginID OID, + ProgramPoint PP) const { + assert(LoanPropagation && "Analysis has not been run."); + return LoanPropagation->getLoans(OID, PP); +} + +std::optional<OriginID> +LifetimeSafetyAnalysis::getOriginIDForDecl(const ValueDecl *D) const { + assert(FactMgr && "FactManager not initialized"); + // This assumes the OriginManager's `get` can find an existing origin. + // We might need a `find` method on OriginManager to avoid `getOrCreate` logic + // in a const-query context if that becomes an issue.
+ return FactMgr->getOriginMgr().get(*D); +} + +std::vector<LoanID> +LifetimeSafetyAnalysis::getLoanIDForVar(const VarDecl *VD) const { + assert(FactMgr && "FactManager not initialized"); + std::vector<LoanID> Result; + for (const Loan &L : FactMgr->getLoanMgr().getLoans()) + if (L.Path.D == VD) + Result.push_back(L.ID); + return Result; +} + +llvm::StringMap<ProgramPoint> LifetimeSafetyAnalysis::getTestPoints() const { + assert(FactMgr && "FactManager not initialized"); + llvm::StringMap<ProgramPoint> AnnotationToPointMap; + for (const CFGBlock *Block : *AC.getCFG()) { + for (const Fact *F : FactMgr->getFacts(Block)) { + if (const auto *TPF = F->getAs<TestPointFact>()) { + StringRef PointName = TPF->getAnnotation(); + assert(AnnotationToPointMap.find(PointName) == + AnnotationToPointMap.end() && + "more than one test point with the same name"); + AnnotationToPointMap[PointName] = F; + } + } + } + return AnnotationToPointMap; +} +} // namespace internal + +void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC) { + internal::LifetimeSafetyAnalysis Analysis(AC); + Analysis.run(); } -} // namespace clang +} // namespace clang::lifetimes diff --git a/clang/lib/Analysis/plugins/CheckerDependencyHandling/CheckerDependencyHandling.cpp b/clang/lib/Analysis/plugins/CheckerDependencyHandling/CheckerDependencyHandling.cpp index aacb886f6e122..518f9e7ddf347 100644 --- a/clang/lib/Analysis/plugins/CheckerDependencyHandling/CheckerDependencyHandling.cpp +++ b/clang/lib/Analysis/plugins/CheckerDependencyHandling/CheckerDependencyHandling.cpp @@ -2,6 +2,9 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "clang/StaticAnalyzer/Frontend/CheckerRegistry.h" +// This barebones plugin is used by clang/test/Analysis/checker-plugins.c +// to test dependency handling among checkers loaded from plugins. + using namespace clang; using namespace ento; @@ -15,12 +18,12 @@ struct DependendentChecker : public Checker { } // end anonymous namespace // Register plugin!
-extern "C" void clang_registerCheckers(CheckerRegistry ®istry) { - registry.addChecker(registerMyChecker, shouldRegisterMyChecker, - "example.MyChecker", "Example Description", - "example.mychecker.documentation.nonexistent.html", - /*isHidden*/false); +extern "C" void clang_registerCheckers(CheckerRegistry &Registry) { + Registry.addChecker(registerMyChecker, shouldRegisterMyChecker, + "example.MyChecker", "Example Description"); - registry.addCheckerOption(/*OptionType*/ "bool", + Registry.addCheckerOption(/*OptionType*/ "bool", /*CheckerFullName*/ "example.MyChecker", /*OptionName*/ "ExampleOption", /*DefaultValStr*/ "false", diff --git a/clang/lib/Analysis/plugins/SampleAnalyzer/MainCallChecker.cpp b/clang/lib/Analysis/plugins/SampleAnalyzer/MainCallChecker.cpp index fd210d733fd0a..53a01d278e6da 100644 --- a/clang/lib/Analysis/plugins/SampleAnalyzer/MainCallChecker.cpp +++ b/clang/lib/Analysis/plugins/SampleAnalyzer/MainCallChecker.cpp @@ -3,12 +3,16 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "clang/StaticAnalyzer/Frontend/CheckerRegistry.h" +// This simple plugin is used by clang/test/Analysis/checker-plugins.c +// to test the use of a checker that is defined in a plugin. + using namespace clang; using namespace ento; namespace { class MainCallChecker : public Checker> { - mutable std::unique_ptr BT; + + const BugType BT{this, "call to main", "example analyzer plugin"}; public: void checkPreStmt(const CallExpr *CE, CheckerContext &C) const; @@ -33,21 +37,17 @@ void MainCallChecker::checkPreStmt(const CallExpr *CE, if (!N) return; - if (!BT) - BT.reset(new BugType(this, "call to main", "example analyzer plugin")); - auto report = - std::make_unique(*BT, BT->getDescription(), N); + std::make_unique(BT, BT.getDescription(), N); report->addRange(Callee->getSourceRange()); C.emitReport(std::move(report)); } } // Register plugin! 
-extern "C" void clang_registerCheckers(CheckerRegistry ®istry) { - registry.addChecker( - "example.MainCallChecker", "Disallows calls to functions called main", - ""); +extern "C" void clang_registerCheckers(CheckerRegistry &Registry) { + Registry.addChecker("example.MainCallChecker", + "Example Description"); } extern "C" const char clang_analyzerAPIVersionString[] = diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index 5c2af9b080b83..e3f9760ac7ce3 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -757,6 +757,9 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, case llvm::Triple::FreeBSD: return std::make_unique>(Triple, Opts); + case llvm::Triple::OpenBSD: + return std::make_unique>(Triple, + Opts); default: return std::make_unique(Triple, Opts); } diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 7ff8e51f8a7f8..29de34bbc4fe4 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -623,13 +623,15 @@ bool ARMTargetInfo::handleTargetFeatures(std::vector &Features, LDREX = LDREX_W; break; case 7: + case 8: if (ArchProfile == llvm::ARM::ProfileKind::M) LDREX = LDREX_W | LDREX_H | LDREX_B; else LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B; break; - case 8: case 9: + assert(ArchProfile != llvm::ARM::ProfileKind::M && + "No Armv9-M architectures defined"); LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B; } diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h index 35501ed44ccd7..e199df32f56ee 100644 --- a/clang/lib/Basic/Targets/Mips.h +++ b/clang/lib/Basic/Targets/Mips.h @@ -129,7 +129,7 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo { LongWidth = LongAlign = 32; MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32; PointerWidth = PointerAlign = 32; - PtrDiffType = SignedInt; + PtrDiffType = IntPtrType = SignedInt; SizeType = UnsignedInt; SuitableAlign = 64; } @@ -155,7 +155,7 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo { IntMaxType = Int64Type; LongWidth = LongAlign = 64; PointerWidth = PointerAlign = 64; - PtrDiffType = SignedLong; + PtrDiffType = IntPtrType = SignedLong; SizeType = UnsignedLong; } @@ -165,7 +165,7 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo { IntMaxType = Int64Type; LongWidth = LongAlign = 32; PointerWidth = PointerAlign = 32; - PtrDiffType = SignedInt; + PtrDiffType = IntPtrType = SignedInt; SizeType = UnsignedInt; } diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index 42cff6540c5e3..94b018a0751d1 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -496,6 +496,7 @@ class LLVM_LIBRARY_VISIBILITY OpenBSDTargetInfo : public OSTargetInfo { case llvm::Triple::sparcv9: this->MCountName = "_mcount"; break; + case llvm::Triple::loongarch64: case llvm::Triple::riscv64: break; } diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 1abf798d93129..c13b286cd7916 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -264,6 +264,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR32TargetInfo : public SPIRTargetInfo { PointerWidth = PointerAlign = 32; SizeType = TargetInfo::UnsignedInt; PtrDiffType = IntPtrType = TargetInfo::SignedInt; + // SPIR32 has support for atomic ops if atomic extension is enabled. + // Take the maximum because it's possible the Host supports wider types. 
+ MaxAtomicInlineWidth = std::max(MaxAtomicInlineWidth, 32); resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } @@ -281,6 +284,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR64TargetInfo : public SPIRTargetInfo { PointerWidth = PointerAlign = 64; SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; + // SPIR64 has support for atomic ops if atomic extension is enabled. + // Take the maximum because it's possible the Host supports wider types. + MaxAtomicInlineWidth = std::max(MaxAtomicInlineWidth, 64); resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 5bd53ebc52ab5..73c9fb924f682 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -348,22 +348,6 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { return CIRBaseBuilderTy::createStore(loc, val, dst.getPointer(), align); } - mlir::Value createComplexCreate(mlir::Location loc, mlir::Value real, - mlir::Value imag) { - auto resultComplexTy = cir::ComplexType::get(real.getType()); - return create(loc, resultComplexTy, real, imag); - } - - mlir::Value createComplexReal(mlir::Location loc, mlir::Value operand) { - auto operandTy = mlir::cast(operand.getType()); - return create(loc, operandTy.getElementType(), operand); - } - - mlir::Value createComplexImag(mlir::Location loc, mlir::Value operand) { - auto operandTy = mlir::cast(operand.getType()); - return create(loc, operandTy.getElementType(), operand); - } - /// Create a cir.complex.real_ptr operation that derives a pointer to the real /// part of the complex value pointed to by the specified pointer value. 
mlir::Value createComplexRealPtr(mlir::Location loc, mlir::Value value) { @@ -424,21 +408,23 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { } mlir::Value createSetBitfield(mlir::Location loc, mlir::Type resultType, - mlir::Value dstAddr, mlir::Type storageType, + Address dstAddr, mlir::Type storageType, mlir::Value src, const CIRGenBitFieldInfo &info, - bool isLvalueVolatile, bool useVolatile) { - return create(loc, resultType, dstAddr, storageType, - src, info.name, info.size, info.offset, - info.isSigned, isLvalueVolatile); + bool isLvalueVolatile) { + return create( + loc, resultType, dstAddr.getPointer(), storageType, src, info.name, + info.size, info.offset, info.isSigned, isLvalueVolatile, + dstAddr.getAlignment().getAsAlign().value()); } mlir::Value createGetBitfield(mlir::Location loc, mlir::Type resultType, - mlir::Value addr, mlir::Type storageType, + Address addr, mlir::Type storageType, const CIRGenBitFieldInfo &info, - bool isLvalueVolatile, bool useVolatile) { - return create(loc, resultType, addr, storageType, - info.name, info.size, info.offset, - info.isSigned, isLvalueVolatile); + bool isLvalueVolatile) { + return create( + loc, resultType, addr.getPointer(), storageType, info.name, info.size, + info.offset, info.isSigned, isLvalueVolatile, + addr.getAlignment().getAsAlign().value()); } }; diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 72e8d71c366d8..ef136f80637f3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -57,6 +57,20 @@ static RValue emitBuiltinBitOp(CIRGenFunction &cgf, const CallExpr *e, return RValue::get(result); } +RValue CIRGenFunction::emitRotate(const CallExpr *e, bool isRotateLeft) { + mlir::Value input = emitScalarExpr(e->getArg(0)); + mlir::Value amount = emitScalarExpr(e->getArg(1)); + + // TODO(cir): MSVC flavor bit rotate builtins use different types for input + // and amount, but cir.rotate requires them to have the same type. Cast amount + // to the type of input when necessary. 
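// A hypothetical shape for that cast, once the MSVC builtins are handled
// (createIntCast is assumed here purely for illustration; the real helper may
// differ):
//   if (amount.getType() != input.getType())
//     amount = builder.createIntCast(amount, input.getType());
// so that cir.rotate always sees matching operand types.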
+ assert(!cir::MissingFeatures::msvcBuiltins()); + + auto r = builder.create(getLoc(e->getSourceRange()), input, + amount, isRotateLeft); + return RValue::get(r); +} + RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, const CallExpr *e, ReturnValueSlot returnValue) { @@ -107,11 +121,18 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return RValue::get(nullptr); } + case Builtin::BI__builtin_assume_separate_storage: { + mlir::Value value0 = emitScalarExpr(e->getArg(0)); + mlir::Value value1 = emitScalarExpr(e->getArg(1)); + builder.create(loc, value0, value1); + return RValue::get(nullptr); + } + case Builtin::BI__builtin_complex: { mlir::Value real = emitScalarExpr(e->getArg(0)); mlir::Value imag = emitScalarExpr(e->getArg(1)); mlir::Value complex = builder.createComplexCreate(loc, real, imag); - return RValue::get(complex); + return RValue::getComplex(complex); } case Builtin::BI__builtin_creal: @@ -136,6 +157,18 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return RValue::get(imag); } + case Builtin::BI__builtin_conj: + case Builtin::BI__builtin_conjf: + case Builtin::BI__builtin_conjl: + case Builtin::BIconj: + case Builtin::BIconjf: + case Builtin::BIconjl: { + mlir::Value complex = emitComplexExpr(e->getArg(0)); + mlir::Value conj = builder.createUnaryOp(getLoc(e->getExprLoc()), + cir::UnaryOpKind::Not, complex); + return RValue::getComplex(conj); + } + case Builtin::BI__builtin_clrsb: case Builtin::BI__builtin_clrsbl: case Builtin::BI__builtin_clrsbll: @@ -219,6 +252,18 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, mlir::Value arg = emitScalarExpr(e->getArg(0)); return RValue::get(builder.create(loc, arg)); } + + case Builtin::BI__builtin_rotateleft8: + case Builtin::BI__builtin_rotateleft16: + case Builtin::BI__builtin_rotateleft32: + case Builtin::BI__builtin_rotateleft64: + return emitRotate(e, /*isRotateLeft=*/true); + + case Builtin::BI__builtin_rotateright8: + case Builtin::BI__builtin_rotateright16: + case Builtin::BI__builtin_rotateright32: + case Builtin::BI__builtin_rotateright64: + return emitRotate(e, /*isRotateLeft=*/false); } // If this is an alias for a lib function (e.g. __builtin_sin), emit diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h index eb079b877b7ff..5929568505ef2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h +++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h @@ -75,6 +75,11 @@ class CIRGenCXXABI { /// Emit dtor variants required by this ABI. virtual void emitCXXDestructors(const clang::CXXDestructorDecl *d) = 0; + virtual void emitDestructorCall(CIRGenFunction &cgf, + const CXXDestructorDecl *dd, CXXDtorType type, + bool forVirtualBase, bool delegating, + Address thisAddr, QualType thisTy) = 0; + /// Returns true if the given destructor type should be emitted as a linkonce /// delegating thunk, regardless of whether the dtor is defined in this TU or /// not. 
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenCXXExpr.cpp index 8da832d9118e3..67d8988a5fbbd 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXXExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCXXExpr.cpp @@ -246,6 +246,29 @@ static void emitNewInitializer(CIRGenFunction &cgf, const CXXNewExpr *e, } } +RValue CIRGenFunction::emitCXXDestructorCall( + GlobalDecl dtor, const CIRGenCallee &callee, mlir::Value thisVal, + QualType thisTy, mlir::Value implicitParam, QualType implicitParamTy, + const CallExpr *ce) { + const CXXMethodDecl *dtorDecl = cast(dtor.getDecl()); + + assert(!thisTy.isNull()); + assert(thisTy->getAsCXXRecordDecl() == dtorDecl->getParent() && + "Pointer/Object mixup"); + + assert(!cir::MissingFeatures::addressSpace()); + + CallArgList args; + commonBuildCXXMemberOrOperatorCall(*this, dtorDecl, thisVal, implicitParam, + implicitParamTy, ce, args, nullptr); + assert((ce || dtor.getDecl()) && "expected source location provider"); + assert(!cir::MissingFeatures::opCallMustTail()); + return emitCall(cgm.getTypes().arrangeCXXStructorDeclaration(dtor), callee, + ReturnValueSlot(), args, nullptr, + ce ? getLoc(ce->getExprLoc()) + : getLoc(dtor.getDecl()->getSourceRange())); +} + /// Emit a call to an operator new or operator delete function, as implicitly /// created by new-expressions and delete-expressions. static RValue emitNewDeleteCall(CIRGenFunction &cgf, diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index 8667bb60d114e..fbf53dbdf385b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -392,6 +392,14 @@ void CIRGenFunction::emitDelegatingCXXConstructorCall( } } +void CIRGenFunction::emitCXXDestructorCall(const CXXDestructorDecl *dd, + CXXDtorType type, + bool forVirtualBase, bool delegating, + Address thisAddr, QualType thisTy) { + cgm.getCXXABI().emitDestructorCall(*this, dd, type, forVirtualBase, + delegating, thisAddr, thisTy); +} + Address CIRGenFunction::getAddressOfBaseClass( Address value, const CXXRecordDecl *derived, llvm::iterator_range path, diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 51da48d330f55..1f64801926887 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -333,13 +333,12 @@ mlir::Value CIRGenFunction::emitStoreThroughBitfieldLValue(RValue src, Address ptr = dst.getBitFieldAddress(); assert(!cir::MissingFeatures::armComputeVolatileBitfields()); - const bool useVolatile = false; mlir::Value dstAddr = dst.getAddress().getPointer(); - return builder.createSetBitfield(dstAddr.getLoc(), resLTy, dstAddr, + return builder.createSetBitfield(dstAddr.getLoc(), resLTy, ptr, ptr.getElementType(), src.getValue(), info, - dst.isVolatileQualified(), useVolatile); + dst.isVolatileQualified()); } RValue CIRGenFunction::emitLoadOfBitfieldLValue(LValue lv, SourceLocation loc) { @@ -352,8 +351,7 @@ RValue CIRGenFunction::emitLoadOfBitfieldLValue(LValue lv, SourceLocation loc) { assert(!cir::MissingFeatures::armComputeVolatileBitfields()); mlir::Value field = builder.createGetBitfield( - getLoc(loc), resLTy, ptr.getPointer(), ptr.getElementType(), info, - lv.isVolatile(), false); + getLoc(loc), resLTy, ptr, ptr.getElementType(), info, lv.isVolatile()); assert(!cir::MissingFeatures::opLoadEmitScalarRangeCheck() && "NYI"); return RValue::get(field); } @@ -366,7 +364,10 @@ Address CIRGenFunction::getAddrOfBitFieldStorage(LValue base, cir::PointerType fieldPtr = 
cir::PointerType::get(fieldType); cir::GetMemberOp sea = getBuilder().createGetMember( loc, fieldPtr, base.getPointer(), field->getName(), index); - return Address(sea, CharUnits::One()); + auto rec = cast(base.getAddress().getElementType()); + CharUnits offset = CharUnits::fromQuantity( + rec.getElementOffset(cgm.getDataLayout().layout, index)); + return Address(sea, base.getAlignment().alignmentAtOffset(offset)); } LValue CIRGenFunction::emitLValueForBitField(LValue base, @@ -662,7 +663,8 @@ LValue CIRGenFunction::emitUnaryOpLValue(const UnaryOperator *e) { } case UO_PreInc: case UO_PreDec: { - bool isInc = e->isIncrementOp(); + cir::UnaryOpKind kind = + e->isIncrementOp() ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; LValue lv = emitLValue(e->getSubExpr()); assert(e->isPrefix() && "Prefix operator in unexpected state!"); @@ -671,7 +673,7 @@ LValue CIRGenFunction::emitUnaryOpLValue(const UnaryOperator *e) { cgm.errorNYI(e->getSourceRange(), "UnaryOp complex inc/dec"); lv = LValue(); } else { - emitScalarPrePostIncDec(e, lv, isInc, /*isPre=*/true); + emitScalarPrePostIncDec(e, lv, kind, /*isPre=*/true); } return lv; @@ -1053,6 +1055,67 @@ LValue CIRGenFunction::emitMemberExpr(const MemberExpr *e) { llvm_unreachable("Unhandled member declaration!"); } +/// Evaluate an expression into a given memory location. +void CIRGenFunction::emitAnyExprToMem(const Expr *e, Address location, + Qualifiers quals, bool isInit) { + // FIXME: This function should take an LValue as an argument. + switch (getEvaluationKind(e->getType())) { + case cir::TEK_Complex: { + LValue lv = makeAddrLValue(location, e->getType()); + emitComplexExprIntoLValue(e, lv, isInit); + return; + } + + case cir::TEK_Aggregate: { + emitAggExpr(e, AggValueSlot::forAddr(location, quals, + AggValueSlot::IsDestructed_t(isInit), + AggValueSlot::IsAliased_t(!isInit), + AggValueSlot::MayOverlap)); + return; + } + + case cir::TEK_Scalar: { + RValue rv = RValue::get(emitScalarExpr(e)); + LValue lv = makeAddrLValue(location, e->getType()); + emitStoreThroughLValue(rv, lv); + return; + } + } + + llvm_unreachable("bad evaluation kind"); +} + +LValue CIRGenFunction::emitCompoundLiteralLValue(const CompoundLiteralExpr *e) { + if (e->isFileScope()) { + cgm.errorNYI(e->getSourceRange(), "emitCompoundLiteralLValue: FileScope"); + return {}; + } + + if (e->getType()->isVariablyModifiedType()) { + cgm.errorNYI(e->getSourceRange(), + "emitCompoundLiteralLValue: VariablyModifiedType"); + return {}; + } + + Address declPtr = createMemTemp(e->getType(), getLoc(e->getSourceRange()), + ".compoundliteral"); + const Expr *initExpr = e->getInitializer(); + LValue result = makeAddrLValue(declPtr, e->getType(), AlignmentSource::Decl); + + emitAnyExprToMem(initExpr, declPtr, e->getType().getQualifiers(), + /*Init*/ true); + + // Block-scope compound literals are destroyed at the end of the enclosing + // scope in C. 
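// (An illustrative C case, assuming ObjC ARC is enabled: for
//   struct S { __strong id obj; };
//   struct S *p = &(struct S){ .obj = x };
// the compound literal has a non-trivially-destructed type, so a cleanup is
// needed when the enclosing scope ends; that is the case rejected as NYI
// below.)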
+ if (!getLangOpts().CPlusPlus && e->getType().isDestructedType()) { + cgm.errorNYI(e->getSourceRange(), + "emitCompoundLiteralLValue: non C++ DestructedType"); + return {}; + } + + return result; +} + LValue CIRGenFunction::emitCallExprLValue(const CallExpr *e) { RValue rv = emitCallExpr(e); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 3273d9000771a..6756a7ce067a5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -52,11 +52,37 @@ class ComplexExprEmitter : public StmtVisitor { mlir::Value VisitGenericSelectionExpr(GenericSelectionExpr *e); mlir::Value VisitImplicitCastExpr(ImplicitCastExpr *e); mlir::Value VisitInitListExpr(const InitListExpr *e); + + mlir::Value VisitCompoundLiteralExpr(CompoundLiteralExpr *e) { + return emitLoadOfLValue(e); + } + mlir::Value VisitImaginaryLiteral(const ImaginaryLiteral *il); mlir::Value VisitParenExpr(ParenExpr *e); mlir::Value VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *e); + + mlir::Value VisitPrePostIncDec(const UnaryOperator *e, cir::UnaryOpKind op, + bool isPre); + + mlir::Value VisitUnaryPostDec(const UnaryOperator *e) { + return VisitPrePostIncDec(e, cir::UnaryOpKind::Dec, false); + } + + mlir::Value VisitUnaryPostInc(const UnaryOperator *e) { + return VisitPrePostIncDec(e, cir::UnaryOpKind::Inc, false); + } + + mlir::Value VisitUnaryPreDec(const UnaryOperator *e) { + return VisitPrePostIncDec(e, cir::UnaryOpKind::Dec, true); + } + + mlir::Value VisitUnaryPreInc(const UnaryOperator *e) { + return VisitPrePostIncDec(e, cir::UnaryOpKind::Inc, true); + } + mlir::Value VisitUnaryDeref(const Expr *e); + mlir::Value VisitUnaryNot(const UnaryOperator *e); struct BinOpInfo { mlir::Location loc; @@ -230,8 +256,7 @@ mlir::Value ComplexExprEmitter::VisitBinComma(const BinaryOperator *e) { mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) { if (e->getCallReturnType(cgf.getContext())->isReferenceType()) return emitLoadOfLValue(e); - - return cgf.emitCallExpr(e).getValue(); + return cgf.emitCallExpr(e).getComplexValue(); } mlir::Value ComplexExprEmitter::VisitCastExpr(CastExpr *e) { @@ -334,10 +359,22 @@ mlir::Value ComplexExprEmitter::VisitSubstNonTypeTemplateParmExpr( return Visit(e->getReplacement()); } +mlir::Value ComplexExprEmitter::VisitPrePostIncDec(const UnaryOperator *e, + cir::UnaryOpKind op, + bool isPre) { + LValue lv = cgf.emitLValue(e->getSubExpr()); + return cgf.emitComplexPrePostIncDec(e, lv, op, isPre); +} + mlir::Value ComplexExprEmitter::VisitUnaryDeref(const Expr *e) { return emitLoadOfLValue(e); } +mlir::Value ComplexExprEmitter::VisitUnaryNot(const UnaryOperator *e) { + mlir::Value op = Visit(e->getSubExpr()); + return builder.createNot(op); +} + mlir::Value ComplexExprEmitter::emitPromoted(const Expr *e, QualType promotionTy) { e = e->IgnoreParens(); @@ -417,6 +454,41 @@ mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) { return ComplexExprEmitter(*this).Visit(const_cast(e)); } +mlir::Value CIRGenFunction::emitComplexPrePostIncDec(const UnaryOperator *e, + LValue lv, + cir::UnaryOpKind op, + bool isPre) { + assert(op == cir::UnaryOpKind::Inc || + op == cir::UnaryOpKind::Dec && "Invalid UnaryOp kind for ComplexType"); + + mlir::Value inVal = emitLoadOfComplex(lv, e->getExprLoc()); + mlir::Location loc = getLoc(e->getExprLoc()); + mlir::Value incVal = builder.createUnaryOp(loc, op, inVal); + + // Store the updated result through the lvalue. 
+ emitStoreOfComplex(loc, incVal, lv, /*isInit=*/false); + + if (getLangOpts().OpenMP) + cgm.errorNYI(loc, "emitComplexPrePostIncDec OpenMP"); + + // If this is a postinc, return the value read from memory, otherwise use the + // updated value. + return isPre ? incVal : inVal; +} + +void CIRGenFunction::emitComplexExprIntoLValue(const Expr *e, LValue dest, + bool isInit) { + assert(e && getComplexType(e->getType()) && + "Invalid complex expression to emit"); + ComplexExprEmitter emitter(*this); + mlir::Value value = emitter.Visit(const_cast(e)); + emitter.emitStoreOfComplex(getLoc(e->getExprLoc()), value, dest, isInit); +} + +mlir::Value CIRGenFunction::emitLoadOfComplex(LValue src, SourceLocation loc) { + return ComplexExprEmitter(*this).emitLoadOfLValue(src, loc); +} + void CIRGenFunction::emitStoreOfComplex(mlir::Location loc, mlir::Value v, LValue dest, bool isInit) { ComplexExprEmitter(*this).emitStoreOfComplex(loc, v, dest, isInit); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 9e13b4c83e3a8..eba6bffbf2927 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -233,6 +233,10 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitMemberExpr(MemberExpr *e); + mlir::Value VisitCompoundLiteralExpr(CompoundLiteralExpr *e) { + return emitLoadOfLValue(e); + } + mlir::Value VisitInitListExpr(InitListExpr *e); mlir::Value VisitExplicitCastExpr(ExplicitCastExpr *e) { @@ -383,22 +387,22 @@ class ScalarExprEmitter : public StmtVisitor { // Unary Operators. mlir::Value VisitUnaryPostDec(const UnaryOperator *e) { LValue lv = cgf.emitLValue(e->getSubExpr()); - return emitScalarPrePostIncDec(e, lv, false, false); + return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Dec, false); } mlir::Value VisitUnaryPostInc(const UnaryOperator *e) { LValue lv = cgf.emitLValue(e->getSubExpr()); - return emitScalarPrePostIncDec(e, lv, true, false); + return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Inc, false); } mlir::Value VisitUnaryPreDec(const UnaryOperator *e) { LValue lv = cgf.emitLValue(e->getSubExpr()); - return emitScalarPrePostIncDec(e, lv, false, true); + return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Dec, true); } mlir::Value VisitUnaryPreInc(const UnaryOperator *e) { LValue lv = cgf.emitLValue(e->getSubExpr()); - return emitScalarPrePostIncDec(e, lv, true, true); + return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Inc, true); } mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv, - bool isInc, bool isPre) { + cir::UnaryOpKind kind, bool isPre) { if (cgf.getLangOpts().OpenMP) cgf.cgm.errorNYI(e->getSourceRange(), "inc/dec OpenMP"); @@ -427,7 +431,7 @@ class ScalarExprEmitter : public StmtVisitor { // -> bool = ((int)bool + 1 != 0) // An interesting aspect of this is that increment is always true. // Decrement does not have this property. - if (isInc && type->isBooleanType()) { + if (kind == cir::UnaryOpKind::Inc && type->isBooleanType()) { value = builder.getTrue(cgf.getLoc(e->getExprLoc())); } else if (type->isIntegerType()) { QualType promotedType; @@ -458,7 +462,7 @@ class ScalarExprEmitter : public StmtVisitor { assert(!cir::MissingFeatures::sanitizers()); if (e->canOverflow() && type->isSignedIntegerOrEnumerationType()) { - value = emitIncDecConsiderOverflowBehavior(e, value, isInc); + value = emitIncDecConsiderOverflowBehavior(e, value, kind); } else { cir::UnaryOpKind kind = e->isIncrementOp() ? 
cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; @@ -480,7 +484,7 @@ class ScalarExprEmitter : public StmtVisitor { // For everything else, we can just do a simple increment. mlir::Location loc = cgf.getLoc(e->getSourceRange()); CIRGenBuilderTy &builder = cgf.getBuilder(); - int amount = (isInc ? 1 : -1); + int amount = kind == cir::UnaryOpKind::Inc ? 1 : -1; mlir::Value amt = builder.getSInt32(amount, loc); assert(!cir::MissingFeatures::sanitizers()); value = builder.createPtrStride(loc, value, amt); @@ -500,8 +504,8 @@ class ScalarExprEmitter : public StmtVisitor { if (mlir::isa(value.getType())) { // Create the inc/dec operation. // NOTE(CIR): clang calls CreateAdd but folds this to a unary op - cir::UnaryOpKind kind = - (isInc ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec); + assert(kind == cir::UnaryOpKind::Inc || + kind == cir::UnaryOpKind::Dec && "Invalid UnaryOp kind"); value = emitUnaryOp(e, kind, value); } else { cgf.cgm.errorNYI(e->getSourceRange(), "Unary inc/dec other fp type"); @@ -532,9 +536,9 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value emitIncDecConsiderOverflowBehavior(const UnaryOperator *e, mlir::Value inVal, - bool isInc) { - cir::UnaryOpKind kind = - e->isIncrementOp() ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; + cir::UnaryOpKind kind) { + assert(kind == cir::UnaryOpKind::Inc || + kind == cir::UnaryOpKind::Dec && "Invalid UnaryOp kind"); switch (cgf.getLangOpts().getSignedOverflowBehavior()) { case LangOptions::SOB_Defined: return emitUnaryOp(e, kind, inVal, /*nsw=*/false); @@ -2147,8 +2151,9 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( } mlir::Value CIRGenFunction::emitScalarPrePostIncDec(const UnaryOperator *e, - LValue lv, bool isInc, + LValue lv, + cir::UnaryOpKind kind, bool isPre) { return ScalarExprEmitter(*this, builder) - .emitScalarPrePostIncDec(e, lv, isInc, isPre); + .emitScalarPrePostIncDec(e, lv, kind, isPre); } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index e532b9d855843..3e69e5673dd86 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -593,11 +593,12 @@ void CIRGenFunction::emitDestructorBody(FunctionArgList &args) { assert(!cir::MissingFeatures::dtorCleanups()); - // TODO(cir): A complete destructor is supposed to call the base destructor. - // Since we have to emit both dtor kinds we just fall through for now and. - // As long as we don't support virtual bases this should be functionally - // equivalent. - assert(!cir::MissingFeatures::completeDtors()); + if (!isTryBody) { + QualType thisTy = dtor->getFunctionObjectParameterType(); + emitCXXDestructorCall(dtor, Dtor_Base, /*forVirtualBase=*/false, + /*delegating=*/false, loadCXXThisAddress(), thisTy); + break; + } // Fallthrough: act like we're in the base variant. 
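The emitDestructorBody change in the CIRGenFunction.cpp hunk above makes the complete-object destructor delegate to the base-object destructor instead of relying on fallthrough. A minimal sketch of the two Itanium destructor symbols involved (illustrative only, not part of the patch):

struct S {
  ~S();
};
// _ZN1SD2Ev: base-object destructor, destroys members and non-virtual bases.
// _ZN1SD1Ev: complete-object destructor, now emitted as a call to _ZN1SD2Ev.
//            This is equivalent as long as virtual bases are not involved,
//            which matches the current CIR limitations.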
[[fallthrough]]; @@ -698,6 +699,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) { return emitStringLiteralLValue(cast(e)); case Expr::MemberExprClass: return emitMemberExpr(cast(e)); + case Expr::CompoundLiteralExprClass: + return emitCompoundLiteralLValue(cast(e)); case Expr::BinaryOperatorClass: return emitBinaryOperatorLValue(cast(e)); case Expr::CompoundAssignOperatorClass: { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 1346333739bc1..2aceeef793385 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -757,6 +757,11 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitAnyExpr(const clang::Expr *e, AggValueSlot aggSlot = AggValueSlot::ignored()); + /// Emits the code necessary to evaluate an arbitrary expression into the + /// given memory location. + void emitAnyExprToMem(const Expr *e, Address location, Qualifiers quals, + bool isInitializer); + /// Similarly to emitAnyExpr(), however, the result will always be accessible /// even if no aggregate location is provided. RValue emitAnyExprToTemp(const clang::Expr *e); @@ -828,6 +833,7 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::Value emitCheckedArgForAssume(const Expr *e); LValue emitCompoundAssignmentLValue(const clang::CompoundAssignOperator *e); + LValue emitCompoundLiteralLValue(const CompoundLiteralExpr *e); void emitConstructorBody(FunctionArgList &args); void emitDestructorBody(FunctionArgList &args); @@ -847,6 +853,15 @@ class CIRGenFunction : public CIRGenTypeCache { bool delegating, Address thisAddr, CallArgList &args, clang::SourceLocation loc); + void emitCXXDestructorCall(const CXXDestructorDecl *dd, CXXDtorType type, + bool forVirtualBase, bool delegating, + Address thisAddr, QualType thisTy); + + RValue emitCXXDestructorCall(GlobalDecl dtor, const CIRGenCallee &callee, + mlir::Value thisVal, QualType thisTy, + mlir::Value implicitParam, + QualType implicitParamTy, const CallExpr *e); + mlir::LogicalResult emitCXXForRangeStmt(const CXXForRangeStmt &s, llvm::ArrayRef attrs); @@ -911,7 +926,7 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::Value emitScalarExpr(const clang::Expr *e); mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv, - bool isInc, bool isPre); + cir::UnaryOpKind kind, bool isPre); /// Build a debug stoppoint if we are emitting debug info. void emitStopPoint(const Stmt *s); @@ -930,6 +945,11 @@ class CIRGenFunction : public CIRGenTypeCache { /// returning the result. mlir::Value emitComplexExpr(const Expr *e); + void emitComplexExprIntoLValue(const Expr *e, LValue dest, bool isInit); + + mlir::Value emitComplexPrePostIncDec(const UnaryOperator *e, LValue lv, + cir::UnaryOpKind op, bool isPre); + LValue emitComplexAssignmentLValue(const BinaryOperator *e); void emitCompoundStmt(const clang::CompoundStmt &s); @@ -980,6 +1000,9 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitLoadOfBitfieldLValue(LValue lv, SourceLocation loc); + /// Load a complex number from the specified l-value. + mlir::Value emitLoadOfComplex(LValue src, SourceLocation loc); + /// Given an expression that represents a value lvalue, this method emits /// the address of the lvalue, then loads the result as an rvalue, /// returning the rvalue. 
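The complex-expression entry points declared above (emitComplexPrePostIncDec, emitComplexExprIntoLValue, emitLoadOfComplex), together with the LoweringPrepare rewrite later in this patch, implement the usual C semantics for unary operators on _Complex values: ++ and -- adjust only the real component, and the GNU ~ operator is conjugation. A minimal sketch of those semantics, mirrored with std::complex rather than CIR (plain host C++, no Clang involvement, assumptions only):

#include <cassert>
#include <complex>

int main() {
  std::complex<double> z{1.0, 2.0};

  // What ++z yields on a _Complex double: real part incremented, imaginary
  // part untouched (a cir.unary inc on the real component after lowering).
  std::complex<double> pre{z.real() + 1.0, z.imag()};

  // What ~z yields: real part untouched, imaginary part negated.
  std::complex<double> conj{z.real(), -z.imag()};

  assert(pre == std::complex<double>(2.0, 2.0));
  assert(conj == std::complex<double>(1.0, -2.0));
  return 0;
}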
@@ -1030,6 +1053,8 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitReturnStmt(const clang::ReturnStmt &s); + RValue emitRotate(const CallExpr *e, bool isRotateLeft); + mlir::Value emitScalarConstant(const ConstantEmission &constant, Expr *e); /// Emit a conversion from the specified type to the specified destination diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp index 1496d877e7239..6577f5fb0f2ef 100644 --- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp @@ -46,6 +46,11 @@ class CIRGenItaniumCXXABI : public CIRGenCXXABI { void emitCXXDestructors(const clang::CXXDestructorDecl *d) override; void emitCXXStructor(clang::GlobalDecl gd) override; + void emitDestructorCall(CIRGenFunction &cgf, const CXXDestructorDecl *dd, + CXXDtorType type, bool forVirtualBase, + bool delegating, Address thisAddr, + QualType thisTy) override; + bool useThunkForDtorVariant(const CXXDestructorDecl *dtor, CXXDtorType dt) const override { // Itanium does not emit any destructor variant as an inline thunk. @@ -240,6 +245,25 @@ bool CIRGenItaniumCXXABI::needsVTTParameter(GlobalDecl gd) { return false; } +void CIRGenItaniumCXXABI::emitDestructorCall( + CIRGenFunction &cgf, const CXXDestructorDecl *dd, CXXDtorType type, + bool forVirtualBase, bool delegating, Address thisAddr, QualType thisTy) { + GlobalDecl gd(dd, type); + if (needsVTTParameter(gd)) { + cgm.errorNYI(dd->getSourceRange(), "emitDestructorCall: VTT"); + } + + mlir::Value vtt = nullptr; + ASTContext &astContext = cgm.getASTContext(); + QualType vttTy = astContext.getPointerType(astContext.VoidPtrTy); + assert(!cir::MissingFeatures::appleKext()); + CIRGenCallee callee = + CIRGenCallee::forDirect(cgm.getAddrOfCXXStructor(gd), gd); + + cgf.emitCXXDestructorCall(gd, callee, thisAddr.getPointer(), thisTy, vtt, + vttTy, nullptr); +} + CIRGenCXXABI *clang::CIRGen::CreateCIRGenItaniumCXXABI(CIRGenModule &cgm) { switch (cgm.getASTContext().getCXXABIKind()) { case TargetCXXABI::GenericItanium: diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h index 0a6dba5e80a62..0832c4141a10f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenValue.h +++ b/clang/lib/CIR/CodeGen/CIRGenValue.h @@ -58,6 +58,12 @@ class RValue { return value; } + /// Return the value of this complex value. + mlir::Value getComplexValue() const { + assert(isComplex() && "Not a complex!"); + return value; + } + /// Return the value of the address of the aggregate. 
Address getAggregateAddress() const { assert(isAggregate() && "Not an aggregate!"); diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index 5493b86a0a321..8f848c7345610 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -8,7 +8,9 @@ #include "PassDetail.h" #include "clang/AST/ASTContext.h" +#include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/IR/CIROpsEnums.h" #include "clang/CIR/Dialect/Passes.h" #include @@ -21,17 +23,70 @@ struct LoweringPreparePass : public LoweringPrepareBase { LoweringPreparePass() = default; void runOnOperation() override; - void runOnOp(Operation *op); + void runOnOp(mlir::Operation *op); + void lowerUnaryOp(cir::UnaryOp op); }; } // namespace -void LoweringPreparePass::runOnOp(Operation *op) {} +void LoweringPreparePass::lowerUnaryOp(cir::UnaryOp op) { + mlir::Type ty = op.getType(); + if (!mlir::isa(ty)) + return; + + mlir::Location loc = op.getLoc(); + cir::UnaryOpKind opKind = op.getKind(); + + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op); + + mlir::Value operand = op.getInput(); + mlir::Value operandReal = builder.createComplexReal(loc, operand); + mlir::Value operandImag = builder.createComplexImag(loc, operand); + + mlir::Value resultReal; + mlir::Value resultImag; + + switch (opKind) { + case cir::UnaryOpKind::Inc: + case cir::UnaryOpKind::Dec: + resultReal = builder.createUnaryOp(loc, opKind, operandReal); + resultImag = operandImag; + break; + + case cir::UnaryOpKind::Plus: + case cir::UnaryOpKind::Minus: + llvm_unreachable("Complex unary Plus/Minus NYI"); + break; + + case cir::UnaryOpKind::Not: + resultReal = operandReal; + resultImag = + builder.createUnaryOp(loc, cir::UnaryOpKind::Minus, operandImag); + break; + } + + mlir::Value result = builder.createComplexCreate(loc, resultReal, resultImag); + op.replaceAllUsesWith(result); + op.erase(); +} + +void LoweringPreparePass::runOnOp(mlir::Operation *op) { + if (auto unary = dyn_cast(op)) + lowerUnaryOp(unary); +} void LoweringPreparePass::runOnOperation() { - llvm::SmallVector opsToTransform; + mlir::Operation *op = getOperation(); + + llvm::SmallVector opsToTransform; + + op->walk([&](mlir::Operation *op) { + if (mlir::isa(op)) + opsToTransform.push_back(op); + }); - for (auto *o : opsToTransform) + for (mlir::Operation *o : opsToTransform) runOnOp(o); } diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 7dcea0c8eb529..3cd7de0a56bc3 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -460,6 +460,17 @@ mlir::LogicalResult CIRToLLVMAssumeOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMAssumeSepStorageOpLowering::matchAndRewrite( + cir::AssumeSepStorageOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + auto cond = rewriter.create(op.getLoc(), + rewriter.getI1Type(), 1); + rewriter.replaceOpWithNewOp( + op, cond, mlir::LLVM::AssumeSeparateStorageTag{}, adaptor.getPtr1(), + adaptor.getPtr2()); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMBitClrsbOpLowering::matchAndRewrite( cir::BitClrsbOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { @@ -872,6 +883,21 @@ mlir::LogicalResult 
CIRToLLVMReturnOpLowering::matchAndRewrite( return mlir::LogicalResult::success(); } +mlir::LogicalResult CIRToLLVMRotateOpLowering::matchAndRewrite( + cir::RotateOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + // Note that LLVM intrinsic calls to @llvm.fsh{r,l}.i* have the same type as + // the operand. + mlir::Value input = adaptor.getInput(); + if (op.isRotateLeft()) + rewriter.replaceOpWithNewOp(op, input, input, + adaptor.getAmount()); + else + rewriter.replaceOpWithNewOp(op, input, input, + adaptor.getAmount()); + return mlir::LogicalResult::success(); +} + static mlir::LogicalResult rewriteCallOrInvoke(mlir::Operation *op, mlir::ValueRange callOperands, mlir::ConversionPatternRewriter &rewriter, @@ -2051,6 +2077,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { patterns.add< // clang-format off CIRToLLVMAssumeOpLowering, + CIRToLLVMAssumeSepStorageOpLowering, CIRToLLVMBaseClassAddrOpLowering, CIRToLLVMBinOpLowering, CIRToLLVMBitClrsbOpLowering, @@ -2077,6 +2104,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMGetBitfieldOpLowering, CIRToLLVMGetGlobalOpLowering, CIRToLLVMGetMemberOpLowering, + CIRToLLVMRotateOpLowering, CIRToLLVMSelectOpLowering, CIRToLLVMSetBitfieldOpLowering, CIRToLLVMShiftOpLowering, @@ -2555,7 +2583,7 @@ mlir::LogicalResult CIRToLLVMSetBitfieldOpLowering::matchAndRewrite( assert(storageSize > size && "Invalid bitfield size."); mlir::Value val = rewriter.create( - op.getLoc(), intType, adaptor.getAddr(), /* alignment */ 0, + op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), op.getIsVolatile()); srcVal = @@ -2572,7 +2600,7 @@ mlir::LogicalResult CIRToLLVMSetBitfieldOpLowering::matchAndRewrite( } rewriter.create(op.getLoc(), srcVal, adaptor.getAddr(), - /* alignment */ 0, op.getIsVolatile()); + op.getAlignment(), op.getIsVolatile()); mlir::Type resultTy = getTypeConverter()->convertType(op.getType()); @@ -2646,7 +2674,8 @@ mlir::LogicalResult CIRToLLVMGetBitfieldOpLowering::matchAndRewrite( computeBitfieldIntType(storageType, context, storageSize); mlir::Value val = rewriter.create( - op.getLoc(), intType, adaptor.getAddr(), 0, op.getIsVolatile()); + op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), + op.getIsVolatile()); val = rewriter.create(op.getLoc(), intType, val); if (info.getIsSigned()) { diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index 3c30b1bc5b072..2911ced66e58e 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -44,6 +44,16 @@ class CIRToLLVMAssumeOpLowering mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMAssumeSepStorageOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::AssumeSepStorageOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + class CIRToLLVMBitClrsbOpLowering : public mlir::OpConversionPattern { public: @@ -160,6 +170,16 @@ class CIRToLLVMReturnOpLowering mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMRotateOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::RotateOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + class CIRToLLVMCallOpLowering : public mlir::OpConversionPattern { public: using 
mlir::OpConversionPattern::OpConversionPattern; diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 2f6d4c414e737..1b7257857dd3b 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -407,13 +407,13 @@ static bool initTargetOptions(const CompilerInstance &CI, // Set EABI version. Options.EABIVersion = TargetOpts.EABIVersion; - if (LangOpts.hasSjLjExceptions()) + if (CodeGenOpts.hasSjLjExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::SjLj; - if (LangOpts.hasSEHExceptions()) + if (CodeGenOpts.hasSEHExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::WinEH; - if (LangOpts.hasDWARFExceptions()) + if (CodeGenOpts.hasDWARFExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::DwarfCFI; - if (LangOpts.hasWasmExceptions()) + if (CodeGenOpts.hasWasmExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::Wasm; Options.NoInfsFPMath = LangOpts.NoHonorInfs; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 5f2eb76e7bacb..3f784fc8e798f 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4108,6 +4108,22 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(Result); } + case Builtin::BI__builtin_elementwise_maximumnum: { + Value *Op0 = EmitScalarExpr(E->getArg(0)); + Value *Op1 = EmitScalarExpr(E->getArg(1)); + Value *Result = Builder.CreateBinaryIntrinsic( + Intrinsic::maximumnum, Op0, Op1, nullptr, "elt.maximumnum"); + return RValue::get(Result); + } + + case Builtin::BI__builtin_elementwise_minimumnum: { + Value *Op0 = EmitScalarExpr(E->getArg(0)); + Value *Op1 = EmitScalarExpr(E->getArg(1)); + Value *Result = Builder.CreateBinaryIntrinsic( + Intrinsic::minimumnum, Op0, Op1, nullptr, "elt.minimumnum"); + return RValue::get(Result); + } + case Builtin::BI__builtin_reduce_max: { auto GetIntrinsicID = [this](QualType QT) { if (auto *VecTy = QT->getAs()) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index c8c3d6b20c496..0bceecec6e555 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -214,7 +214,7 @@ static void appendParameterTypes( for (unsigned I = 0, E = FPT->getNumParams(); I != E; ++I) { prefix.push_back(FPT->getParamType(I)); if (ExtInfos[I].hasPassObjectSize()) - prefix.push_back(CGT.getContext().getSizeType()); + prefix.push_back(CGT.getContext().getCanonicalSizeType()); } addExtParameterInfosForCall(paramInfos, FPT.getTypePtr(), PrefixSize, @@ -2852,8 +2852,21 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, if (AI.getInReg()) Attrs.addAttribute(llvm::Attribute::InReg); - if (AI.getIndirectByVal()) + // Depending on the ABI, this may be either a byval or a dead_on_return + // argument. + if (AI.getIndirectByVal()) { Attrs.addByValAttr(getTypes().ConvertTypeForMem(ParamType)); + } else { + // Add dead_on_return when the object's lifetime ends in the callee. + // This includes trivially-destructible objects, as well as objects + // whose destruction / clean-up is carried out within the callee (e.g., + // Obj-C ARC-managed structs, MSVC callee-destroyed objects). 
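The CGCall.cpp comment above describes when an indirectly-passed argument can be marked dead_on_return: the caller never touches the argument slot after the call, either because the object is trivially destructible or because the callee destroys it. A hedged example of the distinction, assuming the Itanium x86-64 ABI (types and names are illustrative only):

// Non-trivial copy constructor forces indirect (non-byval) passing; the
// trivial destructor means the caller never reads the slot again, so the
// parameter gets dead_on_return.
struct Token { Token(const Token &); int id; };
void consume(Token t);

// A non-trivial destructor is run by the caller after the call under the
// Itanium ABI, so the slot stays live and no dead_on_return is added.
struct Guard { Guard(const Guard &); ~Guard(); int fd; };
void consume(Guard g);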
+ if (!ParamType.isDestructedType() || !ParamType->isRecordType() || + ParamType->castAs() + ->getDecl() + ->isParamDestroyedInCallee()) + Attrs.addAttribute(llvm::Attribute::DeadOnReturn); + } auto *Decl = ParamType->getAsRecordDecl(); if (CodeGenOpts.PassByValueIsNoAlias && Decl && diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 117ef3d16e21b..5ee908922b5a3 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -1006,15 +1006,15 @@ RValue CodeGenFunction::EmitCoroutineIntrinsic(const CallExpr *E, } case llvm::Intrinsic::coro_size: { auto &Context = getContext(); - CanQualType SizeTy = Context.getSizeType(); - llvm::IntegerType *T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); + llvm::IntegerType *T = + Builder.getIntNTy(Context.getTypeSize(Context.getSizeType())); llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::coro_size, T); return RValue::get(Builder.CreateCall(F)); } case llvm::Intrinsic::coro_align: { auto &Context = getContext(); - CanQualType SizeTy = Context.getSizeType(); - llvm::IntegerType *T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); + llvm::IntegerType *T = + Builder.getIntNTy(Context.getTypeSize(Context.getSizeType())); llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::coro_align, T); return RValue::get(Builder.CreateCall(F)); } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index b985db7a9494b..a371b6755f74d 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -58,13 +58,6 @@ using namespace clang; using namespace clang::CodeGen; -// TODO: consider deprecating ClArrayBoundsPseudoFn; functionality is subsumed -// by -fsanitize-annotate-debug-info -static llvm::cl::opt ClArrayBoundsPseudoFn( - "array-bounds-pseudofn", llvm::cl::Hidden, llvm::cl::Optional, - llvm::cl::desc("Emit debug info that places array-bounds instrumentation " - "in an inline function called __ubsan_check_array_bounds.")); - static uint32_t getTypeAlignIfRequired(const Type *Ty, const ASTContext &Ctx) { auto TI = Ctx.getTypeInfo(Ty); if (TI.isAlignRequired()) @@ -170,6 +163,10 @@ void CGDebugInfo::addInstToSpecificSourceAtom(llvm::Instruction *KeyInstruction, if (!Group || !CGM.getCodeGenOpts().DebugKeyInstructions) return; + llvm::DISubprogram *SP = KeyInstruction->getFunction()->getSubprogram(); + if (!SP || !SP->getKeyInstructionsEnabled()) + return; + addInstSourceAtomMetadata(KeyInstruction, Group, /*Rank=*/1); llvm::Instruction *BackupI = @@ -4048,7 +4045,8 @@ llvm::DIType *CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile *Unit) { return CreateType(cast(Ty), Unit); case Type::HLSLInlineSpirv: return CreateType(cast(Ty), Unit); - + case Type::PredefinedSugar: + return getOrCreateType(cast(Ty)->desugar(), Unit); case Type::CountAttributed: case Type::Auto: case Type::Attributed: @@ -6064,11 +6062,10 @@ void CGDebugInfo::EmitPseudoVariable(CGBuilderTy &Builder, // ptr, in this case its debug info may not match the actual type of object // being used as in the next instruction, so we will need to emit a pseudo // variable for type-casted value. 
- auto DeclareTypeMatches = [&](auto *DbgDeclare) { + auto DeclareTypeMatches = [&](llvm::DbgVariableRecord *DbgDeclare) { return DbgDeclare->getVariable()->getType() == Type; }; - if (any_of(llvm::findDbgDeclares(Var), DeclareTypeMatches) || - any_of(llvm::findDVRDeclares(Var), DeclareTypeMatches)) + if (any_of(llvm::findDVRDeclares(Var), DeclareTypeMatches)) return; } @@ -6476,24 +6473,25 @@ SanitizerOrdinalToCheckLabel(SanitizerKind::SanitizerOrdinal Ordinal) { llvm::DILocation *CodeGenFunction::SanitizerAnnotateDebugInfo( ArrayRef Ordinals, SanitizerHandler Handler) { + llvm::DILocation *CheckDebugLoc = Builder.getCurrentDebugLocation(); + auto *DI = getDebugInfo(); + if (!DI || !CheckDebugLoc) + return CheckDebugLoc; + const auto &AnnotateDebugInfo = + CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo; + if (AnnotateDebugInfo.empty()) + return CheckDebugLoc; + std::string Label; if (Ordinals.size() == 1) Label = SanitizerOrdinalToCheckLabel(Ordinals[0]); else Label = SanitizerHandlerToCheckLabel(Handler); - llvm::DILocation *CheckDI = Builder.getCurrentDebugLocation(); - - for (auto Ord : Ordinals) { - // TODO: deprecate ClArrayBoundsPseudoFn - if (((ClArrayBoundsPseudoFn && Ord == SanitizerKind::SO_ArrayBounds) || - CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo.has(Ord)) && - CheckDI) { - return getDebugInfo()->CreateSyntheticInlineAt(CheckDI, Label); - } - } + if (any_of(Ordinals, [&](auto Ord) { return AnnotateDebugInfo.has(Ord); })) + return DI->CreateSyntheticInlineAt(CheckDebugLoc, Label); - return CheckDI; + return CheckDebugLoc; } SanitizerDebugLocation::SanitizerDebugLocation( diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index ad138b9876e8c..f86af4581c345 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -131,20 +131,21 @@ const EHPersonality EHPersonality::ZOS_CPlusPlus = {"__zos_cxx_personality_v2", nullptr}; static const EHPersonality &getCPersonality(const TargetInfo &Target, - const LangOptions &L) { + const CodeGenOptions &CGOpts) { const llvm::Triple &T = Target.getTriple(); if (T.isWindowsMSVCEnvironment()) return EHPersonality::MSVC_CxxFrameHandler3; - if (L.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) return EHPersonality::GNU_C_SJLJ; - if (L.hasDWARFExceptions()) + if (CGOpts.hasDWARFExceptions()) return EHPersonality::GNU_C; - if (L.hasSEHExceptions()) + if (CGOpts.hasSEHExceptions()) return EHPersonality::GNU_C_SEH; return EHPersonality::GNU_C; } static const EHPersonality &getObjCPersonality(const TargetInfo &Target, + const CodeGenOptions &CGOpts, const LangOptions &L) { const llvm::Triple &T = Target.getTriple(); if (T.isWindowsMSVCEnvironment()) @@ -152,7 +153,7 @@ static const EHPersonality &getObjCPersonality(const TargetInfo &Target, switch (L.ObjCRuntime.getKind()) { case ObjCRuntime::FragileMacOSX: - return getCPersonality(Target, L); + return getCPersonality(Target, CGOpts); case ObjCRuntime::MacOSX: case ObjCRuntime::iOS: case ObjCRuntime::WatchOS: @@ -165,9 +166,9 @@ static const EHPersonality &getObjCPersonality(const TargetInfo &Target, [[fallthrough]]; case ObjCRuntime::GCC: case ObjCRuntime::ObjFW: - if (L.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) return EHPersonality::GNU_ObjC_SJLJ; - if (L.hasSEHExceptions()) + if (CGOpts.hasSEHExceptions()) return EHPersonality::GNU_ObjC_SEH; return EHPersonality::GNU_ObjC; } @@ -175,19 +176,19 @@ static const EHPersonality &getObjCPersonality(const TargetInfo &Target, } static const EHPersonality 
&getCXXPersonality(const TargetInfo &Target, - const LangOptions &L) { + const CodeGenOptions &CGOpts) { const llvm::Triple &T = Target.getTriple(); if (T.isWindowsMSVCEnvironment()) return EHPersonality::MSVC_CxxFrameHandler3; if (T.isOSAIX()) return EHPersonality::XL_CPlusPlus; - if (L.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) return EHPersonality::GNU_CPlusPlus_SJLJ; - if (L.hasDWARFExceptions()) + if (CGOpts.hasDWARFExceptions()) return EHPersonality::GNU_CPlusPlus; - if (L.hasSEHExceptions()) + if (CGOpts.hasSEHExceptions()) return EHPersonality::GNU_CPlusPlus_SEH; - if (L.hasWasmExceptions()) + if (CGOpts.hasWasmExceptions()) return EHPersonality::GNU_Wasm_CPlusPlus; if (T.isOSzOS()) return EHPersonality::ZOS_CPlusPlus; @@ -197,6 +198,7 @@ static const EHPersonality &getCXXPersonality(const TargetInfo &Target, /// Determines the personality function to use when both C++ /// and Objective-C exceptions are being caught. static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, + const CodeGenOptions &CGOpts, const LangOptions &L) { if (Target.getTriple().isWindowsMSVCEnvironment()) return EHPersonality::MSVC_CxxFrameHandler3; @@ -205,7 +207,7 @@ static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, // In the fragile ABI, just use C++ exception handling and hope // they're not doing crazy exception mixing. case ObjCRuntime::FragileMacOSX: - return getCXXPersonality(Target, L); + return getCXXPersonality(Target, CGOpts); // The ObjC personality defers to the C++ personality for non-ObjC // handlers. Unlike the C++ case, we use the same personality @@ -213,7 +215,7 @@ static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, case ObjCRuntime::MacOSX: case ObjCRuntime::iOS: case ObjCRuntime::WatchOS: - return getObjCPersonality(Target, L); + return getObjCPersonality(Target, CGOpts, L); case ObjCRuntime::GNUstep: return Target.getTriple().isOSCygMing() ? EHPersonality::GNU_CPlusPlus_SEH @@ -223,7 +225,7 @@ static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, // mixed EH. Use the ObjC personality just to avoid returning null. case ObjCRuntime::GCC: case ObjCRuntime::ObjFW: - return getObjCPersonality(Target, L); + return getObjCPersonality(Target, CGOpts, L); } llvm_unreachable("bad runtime kind"); } @@ -237,6 +239,7 @@ static const EHPersonality &getSEHPersonalityMSVC(const llvm::Triple &T) { const EHPersonality &EHPersonality::get(CodeGenModule &CGM, const FunctionDecl *FD) { const llvm::Triple &T = CGM.getTarget().getTriple(); + const CodeGenOptions &CGOpts = CGM.getCodeGenOpts(); const LangOptions &L = CGM.getLangOpts(); const TargetInfo &Target = CGM.getTarget(); @@ -245,10 +248,10 @@ const EHPersonality &EHPersonality::get(CodeGenModule &CGM, return getSEHPersonalityMSVC(T); if (L.ObjC) - return L.CPlusPlus ? getObjCXXPersonality(Target, L) - : getObjCPersonality(Target, L); - return L.CPlusPlus ? getCXXPersonality(Target, L) - : getCPersonality(Target, L); + return L.CPlusPlus ? getObjCXXPersonality(Target, CGOpts, L) + : getObjCPersonality(Target, CGOpts, L); + return L.CPlusPlus ? 
getCXXPersonality(Target, CGOpts) + : getCPersonality(Target, CGOpts); } const EHPersonality &EHPersonality::get(CodeGenFunction &CGF) { @@ -344,7 +347,7 @@ void CodeGenModule::SimplifyPersonality() { return; const EHPersonality &ObjCXX = EHPersonality::get(*this, /*FD=*/nullptr); - const EHPersonality &CXX = getCXXPersonality(getTarget(), LangOpts); + const EHPersonality &CXX = getCXXPersonality(getTarget(), CodeGenOpts); if (&ObjCXX == &CXX) return; @@ -500,7 +503,7 @@ void CodeGenFunction::EmitStartEHSpec(const Decl *D) { // In Wasm EH we currently treat 'throw()' in the same way as 'noexcept'. In // case of throw with types, we ignore it and print a warning for now. // TODO Correctly handle exception specification in Wasm EH - if (CGM.getLangOpts().hasWasmExceptions()) { + if (CGM.getCodeGenOpts().hasWasmExceptions()) { if (EST == EST_DynamicNone) EHStack.pushTerminate(); else @@ -515,8 +518,8 @@ void CodeGenFunction::EmitStartEHSpec(const Decl *D) { // throw with types. // TODO Correctly handle exception specification in Emscripten EH if (getTarget().getCXXABI() == TargetCXXABI::WebAssembly && - CGM.getLangOpts().getExceptionHandling() == - LangOptions::ExceptionHandlingKind::None && + CGM.getCodeGenOpts().getExceptionHandling() == + CodeGenOptions::ExceptionHandlingKind::None && EST == EST_Dynamic) CGM.getDiags().Report(D->getLocation(), diag::warn_wasm_dynamic_exception_spec_ignored) @@ -604,7 +607,7 @@ void CodeGenFunction::EmitEndEHSpec(const Decl *D) { // In wasm we currently treat 'throw()' in the same way as 'noexcept'. In // case of throw with types, we ignore it and print a warning for now. // TODO Correctly handle exception specification in wasm - if (CGM.getLangOpts().hasWasmExceptions()) { + if (CGM.getCodeGenOpts().hasWasmExceptions()) { if (EST == EST_DynamicNone) EHStack.popTerminate(); return; diff --git a/clang/lib/CodeGen/CGLoopInfo.cpp b/clang/lib/CodeGen/CGLoopInfo.cpp index 4a9092842858b..b2b569a43038c 100644 --- a/clang/lib/CodeGen/CGLoopInfo.cpp +++ b/clang/lib/CodeGen/CGLoopInfo.cpp @@ -221,18 +221,6 @@ LoopInfo::createLoopVectorizeMetadata(const LoopAttributes &Attrs, return createUnrollAndJamMetadata(Attrs, LoopProperties, HasUserTransforms); } - // Apply all loop properties to the vectorized loop. - SmallVector FollowupLoopProperties; - FollowupLoopProperties.append(LoopProperties.begin(), LoopProperties.end()); - - // Don't vectorize an already vectorized loop. - FollowupLoopProperties.push_back( - MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.isvectorized"))); - - bool FollowupHasTransforms = false; - SmallVector Followup = createUnrollAndJamMetadata( - Attrs, FollowupLoopProperties, FollowupHasTransforms); - SmallVector Args; Args.append(LoopProperties.begin(), LoopProperties.end()); @@ -286,22 +274,46 @@ LoopInfo::createLoopVectorizeMetadata(const LoopAttributes &Attrs, // 5) it is implied when vectorize.width is unset (0) and the user // explicitly requested fixed-width vectorization, i.e. // vectorize.scalable.enable is false. 
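The CGLoopInfo.cpp change around this point defers building the vectorizer follow-up metadata until after llvm.loop.vectorize.enable has been decided: when vectorization is explicitly enabled, the follow-up properties (including llvm.loop.isvectorized) travel on a llvm.loop.vectorize.followup_all node; when it is only implied, they are appended directly so the loop properties are not duplicated. A small source-level example that reaches this code (illustrative only):

void scale(float *a, int n) {
#pragma clang loop vectorize(enable)
  for (int i = 0; i < n; ++i)
    a[i] *= 2.0f;
}
// The loop's !llvm.loop metadata gains llvm.loop.vectorize.enable = true; any
// additional user transforms are forwarded to the vectorized loop through the
// follow-up list described above.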
+ bool VectorizeEnabled = false; if (Attrs.VectorizeEnable != LoopAttributes::Unspecified || (IsVectorPredicateEnabled && Attrs.VectorizeWidth != 1) || Attrs.VectorizeWidth > 1 || Attrs.VectorizeScalable == LoopAttributes::Enable || (Attrs.VectorizeScalable == LoopAttributes::Disable && Attrs.VectorizeWidth != 1)) { - bool AttrVal = Attrs.VectorizeEnable != LoopAttributes::Disable; + VectorizeEnabled = Attrs.VectorizeEnable != LoopAttributes::Disable; Args.push_back( MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), ConstantAsMetadata::get(ConstantInt::get( - llvm::Type::getInt1Ty(Ctx), AttrVal))})); + llvm::Type::getInt1Ty(Ctx), VectorizeEnabled))})); } - if (FollowupHasTransforms) - Args.push_back( - createFollowupMetadata("llvm.loop.vectorize.followup_all", Followup)); + // Apply all loop properties to the vectorized loop. + SmallVector FollowupLoopProperties; + + // If vectorization is not explicitly enabled, the follow-up metadata will be + // directly appended to the list currently being created. In that case, adding + // LoopProperties to FollowupLoopProperties would result in duplication. + if (VectorizeEnabled) + FollowupLoopProperties.append(LoopProperties.begin(), LoopProperties.end()); + + // Don't vectorize an already vectorized loop. + FollowupLoopProperties.push_back( + MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.isvectorized"))); + + bool FollowupHasTransforms = false; + SmallVector Followup = createUnrollAndJamMetadata( + Attrs, FollowupLoopProperties, FollowupHasTransforms); + + if (FollowupHasTransforms) { + // If vectorization is explicitly enabled, we create a follow-up metadata, + // otherwise directly add the contents of it to Args. + if (VectorizeEnabled) + Args.push_back( + createFollowupMetadata("llvm.loop.vectorize.followup_all", Followup)); + else + Args.append(Followup.begin(), Followup.end()); + } HasUserTransforms = true; return Args; diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 8e71a576552d3..8c66176942cb5 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -285,7 +285,7 @@ class ObjCCommonTypesHelper { SmallVector Params; Params.push_back(Ctx.VoidPtrTy); Params.push_back(Ctx.VoidPtrTy); - Params.push_back(Ctx.getSizeType()); + Params.push_back(Ctx.getCanonicalSizeType()); Params.push_back(Ctx.BoolTy); Params.push_back(Ctx.BoolTy); llvm::FunctionType *FTy = Types.GetFunctionType( diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index e0650067b9547..1a8c6f015bda1 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -846,11 +846,13 @@ void CodeGenFunction::EmitGotoStmt(const GotoStmt &S) { if (HaveInsertPoint()) EmitStopPoint(&S); + ApplyAtomGroup Grp(getDebugInfo()); EmitBranchThroughCleanup(getJumpDestForLabel(S.getLabel())); } void CodeGenFunction::EmitIndirectGotoStmt(const IndirectGotoStmt &S) { + ApplyAtomGroup Grp(getDebugInfo()); if (const LabelDecl *Target = S.getConstantTarget()) { EmitBranchThroughCleanup(getJumpDestForLabel(Target)); return; @@ -869,6 +871,8 @@ void CodeGenFunction::EmitIndirectGotoStmt(const IndirectGotoStmt &S) { cast(IndGotoBB->begin())->addIncoming(V, CurBB); EmitBranch(IndGotoBB); + if (CurBB && CurBB->getTerminator()) + addInstToCurrentSourceAtom(CurBB->getTerminator(), nullptr); } void CodeGenFunction::EmitIfStmt(const IfStmt &S) { @@ -2672,6 +2676,9 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect, llvm::ConstantAsMetadata::get(Loc))); } + // Make 
inline-asm calls Key for the debug info feature Key Instructions. + CGF.addInstToNewSourceAtom(&Result, nullptr); + if (!NoConvergent && CGF.getLangOpts().assumeFunctionsAreConvergent()) // Conservatively, mark all inline asm blocks in CUDA or OpenCL as // convergent (meaning, they may call an intrinsically convergent op, such @@ -2750,6 +2757,7 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S, } } + ApplyAtomGroup Grp(CGF.getDebugInfo()); LValue Dest = ResultRegDests[i]; // ResultTypeRequiresCast elements correspond to the first // ResultTypeRequiresCast.size() elements of RegResults. @@ -2757,7 +2765,8 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S, unsigned Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]); Address A = Dest.getAddress().withElementType(ResultRegTypes[i]); if (CGF.getTargetHooks().isScalarizableAsmOperand(CGF, TruncTy)) { - Builder.CreateStore(Tmp, A); + llvm::StoreInst *S = Builder.CreateStore(Tmp, A); + CGF.addInstToCurrentSourceAtom(S, S->getValueOperand()); continue; } diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 776a646ceb32f..ab345a598c4e8 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -161,8 +161,7 @@ void CodeGenFunction::CGFPOptionsRAII::ConstructorHelper(FPOptions FPFeatures) { llvm::RoundingMode NewRoundingBehavior = FPFeatures.getRoundingMode(); CGF.Builder.setDefaultConstrainedRounding(NewRoundingBehavior); auto NewExceptionBehavior = - ToConstrainedExceptMD(static_cast( - FPFeatures.getExceptionMode())); + ToConstrainedExceptMD(FPFeatures.getExceptionMode()); CGF.Builder.setDefaultConstrainedExcept(NewExceptionBehavior); CGF.SetFastMathFlags(FPFeatures); @@ -721,7 +720,7 @@ static bool matchesStlAllocatorFn(const Decl *D, const ASTContext &Ctx) { (MD->getNumParams() != 1 && MD->getNumParams() != 2)) return false; - if (MD->parameters()[0]->getType().getCanonicalType() != Ctx.getSizeType()) + if (!Ctx.hasSameType(MD->parameters()[0]->getType(), Ctx.getSizeType())) return false; if (MD->getNumParams() == 2) { @@ -2492,6 +2491,7 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) { case Type::ObjCObjectPointer: case Type::BitInt: case Type::HLSLInlineSpirv: + case Type::PredefinedSugar: llvm_unreachable("type class is never variably-modified!"); case Type::Elaborated: diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index dcfdea648e93c..7dccf82b1a7a3 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -416,11 +416,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_sqrt: case AMDGPU::BI__builtin_amdgcn_sqrtf: case AMDGPU::BI__builtin_amdgcn_sqrth: + case AMDGPU::BI__builtin_amdgcn_sqrt_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sqrt); case AMDGPU::BI__builtin_amdgcn_rsq: case AMDGPU::BI__builtin_amdgcn_rsqf: case AMDGPU::BI__builtin_amdgcn_rsqh: + case AMDGPU::BI__builtin_amdgcn_rsq_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq); case AMDGPU::BI__builtin_amdgcn_rsq_clamp: case AMDGPU::BI__builtin_amdgcn_rsq_clampf: @@ -428,15 +430,19 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Intrinsic::amdgcn_rsq_clamp); case AMDGPU::BI__builtin_amdgcn_sinf: case AMDGPU::BI__builtin_amdgcn_sinh: + case AMDGPU::BI__builtin_amdgcn_sin_bf16: return 
emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin); case AMDGPU::BI__builtin_amdgcn_cosf: case AMDGPU::BI__builtin_amdgcn_cosh: + case AMDGPU::BI__builtin_amdgcn_cos_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos); case AMDGPU::BI__builtin_amdgcn_dispatch_ptr: return EmitAMDGPUDispatchPtr(*this, E); case AMDGPU::BI__builtin_amdgcn_logf: + case AMDGPU::BI__builtin_amdgcn_log_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log); case AMDGPU::BI__builtin_amdgcn_exp2f: + case AMDGPU::BI__builtin_amdgcn_exp2_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_exp2); case AMDGPU::BI__builtin_amdgcn_log_clampf: @@ -498,6 +504,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType }); return Builder.CreateCall(F, { Src }); } + case AMDGPU::BI__builtin_amdgcn_tanhf: + case AMDGPU::BI__builtin_amdgcn_tanhh: case AMDGPU::BI__builtin_amdgcn_tanh_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_tanh); @@ -847,6 +855,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4: case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16: @@ -1110,6 +1119,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, ArgsForMatchingMatrixTypes = {4, 1}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4: + ArgsForMatchingMatrixTypes = {5, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4; + break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4: ArgsForMatchingMatrixTypes = {3, 0, 1}; BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4; diff --git a/clang/lib/CodeGen/TargetBuiltins/SPIR.cpp b/clang/lib/CodeGen/TargetBuiltins/SPIR.cpp index 16243951c7bec..243aad8bf7083 100644 --- a/clang/lib/CodeGen/TargetBuiltins/SPIR.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/SPIR.cpp @@ -58,6 +58,18 @@ Value *CodeGenFunction::EmitSPIRVBuiltinExpr(unsigned BuiltinID, /*ReturnType=*/I->getType(), Intrinsic::spv_reflect, ArrayRef{I, N}, nullptr, "spv.reflect"); } + case SPIRV::BI__builtin_spirv_refract: { + Value *I = EmitScalarExpr(E->getArg(0)); + Value *N = EmitScalarExpr(E->getArg(1)); + Value *eta = EmitScalarExpr(E->getArg(2)); + assert(E->getArg(0)->getType()->hasFloatingRepresentation() && + E->getArg(1)->getType()->hasFloatingRepresentation() && + E->getArg(2)->getType()->isFloatingType() && + "refract operands must have a float representation"); + return Builder.CreateIntrinsic( + /*ReturnType=*/I->getType(), Intrinsic::spv_refract, + ArrayRef{I, N, eta}, nullptr, "spv.refract"); + } case SPIRV::BI__builtin_spirv_smoothstep: { Value *Min = EmitScalarExpr(E->getArg(0)); Value *Max = EmitScalarExpr(E->getArg(1)); diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index 0b712ac2dabc4..abb91486e7ee6 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -2470,13 +2470,12 @@ GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset, return llvm::Type::getDoubleTy(getVMContext()); 
} - /// GetINTEGERTypeAtOffset - The ABI specifies that a value should be passed in -/// an 8-byte GPR. This means that we either have a scalar or we are talking -/// about the high or low part of an up-to-16-byte struct. This routine picks -/// the best LLVM IR type to represent this, which may be i64 or may be anything -/// else that the backend will pass in a GPR that works better (e.g. i8, %foo*, -/// etc). +/// one or more 8-byte GPRs. This means that we either have a scalar or we are +/// talking about the high and/or low part of an up-to-16-byte struct. This +/// routine picks the best LLVM IR type to represent this, which may be i64 or +/// may be anything else that the backend will pass in GPRs that works better +/// (e.g. i8, %foo*, etc). /// /// PrefType is an LLVM IR type that corresponds to (part of) the IR type for /// the source type. IROffset is an offset in bytes into the LLVM IR type that @@ -2534,6 +2533,13 @@ GetINTEGERTypeAtOffset(llvm::Type *IRType, unsigned IROffset, SourceOffset); } + // if we have a 128-bit integer, we can pass it safely using an i128 + // so we return that + if (IRType->isIntegerTy(128)) { + assert(IROffset == 0); + return IRType; + } + // Okay, we don't have any better idea of what to pass, so we pass this in an // integer register that isn't too big to fit the rest of the struct. unsigned TySizeInBytes = @@ -2591,8 +2597,7 @@ GetX86_64ByValArgumentPair(llvm::Type *Lo, llvm::Type *Hi, return Result; } -ABIArgInfo X86_64ABIInfo:: -classifyReturnType(QualType RetTy) const { +ABIArgInfo X86_64ABIInfo::classifyReturnType(QualType RetTy) const { // AMD64-ABI 3.2.3p4: Rule 1. Classify the return type with the // classification algorithm. X86_64ABIInfo::Class Lo, Hi; @@ -2638,6 +2643,12 @@ classifyReturnType(QualType RetTy) const { isPromotableIntegerTypeForABI(RetTy)) return ABIArgInfo::getExtend(RetTy); } + + if (ResType->isIntegerTy(128)) { + // i128 are passed directly + assert(Hi == Integer); + return ABIArgInfo::getDirect(ResType); + } break; // AMD64-ABI 3.2.3p4: Rule 4. If the class is SSE, the next @@ -2783,6 +2794,11 @@ X86_64ABIInfo::classifyArgumentType(QualType Ty, unsigned freeIntRegs, return ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty)); } + if (ResType->isIntegerTy(128)) { + assert(Hi == Integer); + ++neededInt; + return ABIArgInfo::getDirect(ResType); + } break; // AMD64-ABI 3.2.3p3: Rule 3. If the class is SSE, the next diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index ec1135eecd401..ff2f92d1a94c8 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -68,6 +68,7 @@ #include "clang/Driver/Types.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" @@ -83,6 +84,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ExitCodes.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MD5.h" #include "llvm/Support/Path.h" @@ -109,65 +111,6 @@ using namespace clang::driver; using namespace clang; using namespace llvm::opt; -static std::optional getOffloadTargetTriple(const Driver &D, - const ArgList &Args) { - auto OffloadTargets = Args.getAllArgValues(options::OPT_offload_EQ); - // Offload compilation flow does not support multiple targets for now. 
We - // need the HIPActionBuilder (and possibly the CudaActionBuilder{,Base}too) - // to support multiple tool chains first. - switch (OffloadTargets.size()) { - default: - D.Diag(diag::err_drv_only_one_offload_target_supported); - return std::nullopt; - case 0: - D.Diag(diag::err_drv_invalid_or_unsupported_offload_target) << ""; - return std::nullopt; - case 1: - break; - } - return llvm::Triple(OffloadTargets[0]); -} - -static std::optional -getNVIDIAOffloadTargetTriple(const Driver &D, const ArgList &Args, - const llvm::Triple &HostTriple) { - if (!Args.hasArg(options::OPT_offload_EQ)) { - return llvm::Triple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda" - : "nvptx-nvidia-cuda"); - } - auto TT = getOffloadTargetTriple(D, Args); - if (TT && (TT->getArch() == llvm::Triple::spirv32 || - TT->getArch() == llvm::Triple::spirv64)) { - if (Args.hasArg(options::OPT_emit_llvm)) - return TT; - D.Diag(diag::err_drv_cuda_offload_only_emit_bc); - return std::nullopt; - } - D.Diag(diag::err_drv_invalid_or_unsupported_offload_target) << TT->str(); - return std::nullopt; -} - -static std::optional -getHIPOffloadTargetTriple(const Driver &D, const ArgList &Args) { - if (!Args.hasArg(options::OPT_offload_EQ)) { - auto OffloadArchs = Args.getAllArgValues(options::OPT_offload_arch_EQ); - if (llvm::is_contained(OffloadArchs, "amdgcnspirv") && - OffloadArchs.size() == 1) - return llvm::Triple("spirv64-amd-amdhsa"); - return llvm::Triple("amdgcn-amd-amdhsa"); // Default HIP triple. - } - auto TT = getOffloadTargetTriple(D, Args); - if (!TT) - return std::nullopt; - if (TT->isAMDGCN() && TT->getVendor() == llvm::Triple::AMD && - TT->getOS() == llvm::Triple::AMDHSA) - return TT; - if (TT->getArch() == llvm::Triple::spirv64) - return TT; - D.Diag(diag::err_drv_invalid_or_unsupported_offload_target) << TT->str(); - return std::nullopt; -} - template static bool usesInput(const ArgList &Args, F &&Fn) { return llvm::any_of(Args, [&](Arg *A) { return (A->getOption().matches(options::OPT_x) && @@ -458,6 +401,44 @@ phases::ID Driver::getFinalPhase(const DerivedArgList &DAL, return FinalPhase; } +llvm::Expected> +Driver::executeProgram(llvm::ArrayRef Args) const { + llvm::SmallString<64> OutputFile; + llvm::sys::fs::createTemporaryFile("driver-program", "txt", OutputFile, + llvm::sys::fs::OF_Text); + llvm::FileRemover OutputRemover(OutputFile.c_str()); + std::optional Redirects[] = { + {""}, + OutputFile.str(), + {""}, + }; + + std::string ErrorMessage; + int SecondsToWait = 60; + if (std::optional Str = + llvm::sys::Process::GetEnv("CLANG_TOOLCHAIN_PROGRAM_TIMEOUT")) { + if (!llvm::to_integer(*Str, SecondsToWait)) + return llvm::createStringError(std::error_code(), + "CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected " + "an integer, got '" + + *Str + "'"); + SecondsToWait = std::max(SecondsToWait, 0); // infinite + } + StringRef Executable = Args[0]; + if (llvm::sys::ExecuteAndWait(Executable, Args, {}, Redirects, SecondsToWait, + /*MemoryLimit=*/0, &ErrorMessage)) + return llvm::createStringError(std::error_code(), + Executable + ": " + ErrorMessage); + + llvm::ErrorOr> OutputBuf = + llvm::MemoryBuffer::getFile(OutputFile.c_str()); + if (!OutputBuf) + return llvm::createStringError(OutputBuf.getError(), + "Failed to read stdout of " + Executable + + ": " + OutputBuf.getError().message()); + return std::move(*OutputBuf); +} + static Arg *MakeInputArg(DerivedArgList &Args, const OptTable &Opts, StringRef Value, bool Claim = true) { Arg *A = new Arg(Opts.getOption(options::OPT_INPUT), Value, @@ -921,250 +902,265 @@ 
Driver::OpenMPRuntimeKind Driver::getOpenMPRuntime(const ArgList &Args) const { return RT; } -static llvm::Triple getSYCLDeviceTriple(StringRef TargetArch) { - SmallVector SYCLAlias = {"spir", "spir64", "spirv", "spirv32", - "spirv64"}; - if (llvm::is_contained(SYCLAlias, TargetArch)) { - llvm::Triple TargetTriple; - TargetTriple.setArchName(TargetArch); - TargetTriple.setVendor(llvm::Triple::UnknownVendor); - TargetTriple.setOS(llvm::Triple::UnknownOS); - return TargetTriple; - } - return llvm::Triple(TargetArch); +// Handles `native` offload architectures by using the 'offload-arch' utility. +static llvm::SmallVector +getSystemOffloadArchs(Compilation &C, Action::OffloadKind Kind) { + StringRef Program = C.getArgs().getLastArgValue( + options::OPT_offload_arch_tool_EQ, "offload-arch"); + + SmallVector GPUArchs; + if (llvm::ErrorOr Executable = + llvm::sys::findProgramByName(Program)) { + llvm::SmallVector Args{*Executable}; + if (Kind == Action::OFK_HIP) + Args.push_back("--only=amdgpu"); + else if (Kind == Action::OFK_Cuda) + Args.push_back("--only=nvptx"); + auto StdoutOrErr = C.getDriver().executeProgram(Args); + + if (!StdoutOrErr) { + C.getDriver().Diag(diag::err_drv_undetermined_gpu_arch) + << Action::GetOffloadKindName(Kind) << StdoutOrErr.takeError() + << "--offload-arch"; + return GPUArchs; + } + if ((*StdoutOrErr)->getBuffer().empty()) { + C.getDriver().Diag(diag::err_drv_undetermined_gpu_arch) + << Action::GetOffloadKindName(Kind) << "No GPU detected in the system" + << "--offload-arch"; + return GPUArchs; + } + + for (StringRef Arch : llvm::split((*StdoutOrErr)->getBuffer(), "\n")) + if (!Arch.empty()) + GPUArchs.push_back(Arch.str()); + } else { + C.getDriver().Diag(diag::err_drv_command_failure) << "offload-arch"; + } + return GPUArchs; } -static bool addSYCLDefaultTriple(Compilation &C, - SmallVectorImpl &SYCLTriples) { - // Check current set of triples to see if the default has already been set. - for (const auto &SYCLTriple : SYCLTriples) { - if (SYCLTriple.getSubArch() == llvm::Triple::NoSubArch && - SYCLTriple.isSPIROrSPIRV()) - return false; +// Attempts to infer the correct offloading toolchain triple by looking at the +// requested offloading kind and architectures. +static llvm::DenseSet +inferOffloadToolchains(Compilation &C, Action::OffloadKind Kind) { + std::set Archs; + for (Arg *A : C.getInputArgs()) { + for (StringRef Arch : A->getValues()) { + if (A->getOption().matches(options::OPT_offload_arch_EQ)) { + if (Arch == "native") { + for (StringRef Str : getSystemOffloadArchs(C, Kind)) + Archs.insert(Str.str()); + } else { + Archs.insert(Arch.str()); + } + } else if (A->getOption().matches(options::OPT_no_offload_arch_EQ)) { + if (Arch == "all") + Archs.clear(); + else + Archs.erase(Arch.str()); + } + } } - // Add the default triple as it was not found. - llvm::Triple DefaultTriple = getSYCLDeviceTriple( - C.getDefaultToolChain().getTriple().isArch32Bit() ? 
"spirv32" - : "spirv64"); - SYCLTriples.insert(SYCLTriples.begin(), DefaultTriple); - return true; + + llvm::DenseSet Triples; + for (llvm::StringRef Arch : Archs) { + OffloadArch ID = StringToOffloadArch(Arch); + if (ID == OffloadArch::UNKNOWN) + ID = StringToOffloadArch( + getProcessorFromTargetID(llvm::Triple("amdgcn-amd-amdhsa"), Arch)); + + if (Kind == Action::OFK_HIP && !IsAMDOffloadArch(ID)) { + C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch) + << "HIP" << Arch; + return llvm::DenseSet(); + } + if (Kind == Action::OFK_Cuda && !IsNVIDIAOffloadArch(ID)) { + C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch) + << "CUDA" << Arch; + return llvm::DenseSet(); + } + if (Kind == Action::OFK_OpenMP && + (ID == OffloadArch::UNKNOWN || ID == OffloadArch::UNUSED)) { + C.getDriver().Diag(clang::diag::err_drv_failed_to_deduce_target_from_arch) + << Arch; + return llvm::DenseSet(); + } + if (ID == OffloadArch::UNKNOWN || ID == OffloadArch::UNUSED) { + C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch) + << "offload" << Arch; + return llvm::DenseSet(); + } + + StringRef Triple; + if (ID == OffloadArch::AMDGCNSPIRV) + Triple = "spirv64-amd-amdhsa"; + else if (IsNVIDIAOffloadArch(ID)) + Triple = C.getDefaultToolChain().getTriple().isArch64Bit() + ? "nvptx64-nvidia-cuda" + : "nvptx-nvidia-cuda"; + else if (IsAMDOffloadArch(ID)) + Triple = "amdgcn-amd-amdhsa"; + else + continue; + + // Make a new argument that dispatches this argument to the appropriate + // toolchain. This is required when we infer it and create potentially + // incompatible toolchains from the global option. + Option Opt = C.getDriver().getOpts().getOption(options::OPT_Xarch__); + unsigned Index = C.getArgs().getBaseArgs().MakeIndex("-Xarch_"); + Arg *A = new Arg(Opt, C.getArgs().getArgString(Index), Index, + C.getArgs().MakeArgString(Triple.split("-").first), + C.getArgs().MakeArgString("--offload-arch=" + Arch)); + C.getArgs().append(A); + Triples.insert(Triple); + } + + // Infer the default target triple if no specific architectures are given. + if (Archs.empty() && Kind == Action::OFK_HIP) + Triples.insert("amdgcn-amd-amdhsa"); + else if (Archs.empty() && Kind == Action::OFK_Cuda) + Triples.insert(C.getDefaultToolChain().getTriple().isArch64Bit() + ? "nvptx64-nvidia-cuda" + : "nvptx-nvidia-cuda"); + else if (Archs.empty() && Kind == Action::OFK_SYCL) + Triples.insert(C.getDefaultToolChain().getTriple().isArch64Bit() + ? "spirv64-unknown-unknown" + : "spirv32-unknown-unknown"); + + // We need to dispatch these to the appropriate toolchain now. + C.getArgs().eraseArg(options::OPT_offload_arch_EQ); + C.getArgs().eraseArg(options::OPT_no_offload_arch_EQ); + + return Triples; } void Driver::CreateOffloadingDeviceToolChains(Compilation &C, InputList &Inputs) { - - // - // CUDA/HIP - // - // We need to generate a CUDA/HIP toolchain if any of the inputs has a CUDA - // or HIP type. However, mixed CUDA/HIP compilation is not supported. 
+ bool UseLLVMOffload = C.getInputArgs().hasArg( + options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false); bool IsCuda = - llvm::any_of(Inputs, [](std::pair &I) { - return types::isCuda(I.first); - }); - bool IsHIP = llvm::any_of(Inputs, [](std::pair &I) { - return types::isHIP(I.first); - }) || - C.getInputArgs().hasArg(options::OPT_hip_link) || - C.getInputArgs().hasArg(options::OPT_hipstdpar); - bool UseLLVMOffload = C.getInputArgs().hasArg( - options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false); - if (IsCuda && IsHIP) { - Diag(clang::diag::err_drv_mix_cuda_hip); + return types::isCuda(I.first); + }) && + !UseLLVMOffload; + bool IsHIP = + (llvm::any_of(Inputs, + [](std::pair &I) { + return types::isHIP(I.first); + }) || + C.getInputArgs().hasArg(options::OPT_hip_link) || + C.getInputArgs().hasArg(options::OPT_hipstdpar)) && + !UseLLVMOffload; + bool IsSYCL = C.getInputArgs().hasFlag(options::OPT_fsycl, + options::OPT_fno_sycl, false); + bool IsOpenMPOffloading = + UseLLVMOffload || + (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, + options::OPT_fno_openmp, false) && + (C.getInputArgs().hasArg(options::OPT_offload_targets_EQ) || + (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && + !(IsCuda || IsHIP)))); + + llvm::SmallSet Kinds; + const std::pair ActiveKinds[] = { + {IsCuda, Action::OFK_Cuda}, + {IsHIP, Action::OFK_HIP}, + {IsOpenMPOffloading, Action::OFK_OpenMP}, + {IsSYCL, Action::OFK_SYCL}}; + for (const auto &[Active, Kind] : ActiveKinds) + if (Active) + Kinds.insert(Kind); + + // We currently don't support any kind of mixed offloading. + if (Kinds.size() > 1) { + Diag(clang::diag::err_drv_mix_offload) + << Action::GetOffloadKindName(*Kinds.begin()).upper() + << Action::GetOffloadKindName(*(++Kinds.begin())).upper(); return; } - if (IsCuda && !UseLLVMOffload) { - auto CudaTriple = getNVIDIAOffloadTargetTriple( - *this, C.getInputArgs(), C.getDefaultToolChain().getTriple()); - if (!CudaTriple) - return; - - auto &TC = - getOffloadToolChain(C.getInputArgs(), Action::OFK_Cuda, *CudaTriple, - C.getDefaultToolChain().getTriple()); - - // Emit a warning if the detected CUDA version is too new. - const CudaInstallationDetector &CudaInstallation = - static_cast(TC).CudaInstallation; - if (CudaInstallation.isValid()) - CudaInstallation.WarnIfUnsupportedVersion(); - C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda); - OffloadArchs[&TC] = getOffloadArchs(C, C.getArgs(), Action::OFK_Cuda, &TC, - /*SpecificToolchain=*/true); - } else if (IsHIP && !UseLLVMOffload) { - if (auto *OMPTargetArg = - C.getInputArgs().getLastArg(options::OPT_offload_targets_EQ)) { - Diag(clang::diag::err_drv_unsupported_opt_for_language_mode) - << OMPTargetArg->getSpelling() << "HIP"; - return; - } - - auto HIPTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs()); - if (!HIPTriple) - return; - - auto &TC = - getOffloadToolChain(C.getInputArgs(), Action::OFK_HIP, *HIPTriple, - C.getDefaultToolChain().getTriple()); - C.addOffloadDeviceToolChain(&TC, Action::OFK_HIP); - - // TODO: Fix 'amdgcnspirv' handling with the new driver. - if (C.getInputArgs().hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false)) - OffloadArchs[&TC] = getOffloadArchs(C, C.getArgs(), Action::OFK_HIP, &TC, - /*SpecificToolchain=*/true); - } + // Initialize the compilation identifier used for unique CUDA / HIP names. 
if (IsCuda || IsHIP) CUIDOpts = CUIDOptions(C.getArgs(), *this); - // - // OpenMP - // - // We need to generate an OpenMP toolchain if the user specified targets with - // the -fopenmp-targets option or used --offload-arch with OpenMP enabled. - bool IsOpenMPOffloading = - ((IsCuda || IsHIP) && UseLLVMOffload) || - (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false) && - (C.getInputArgs().hasArg(options::OPT_offload_targets_EQ) || - C.getInputArgs().hasArg(options::OPT_offload_arch_EQ))); - if (IsOpenMPOffloading) { - // We expect that -fopenmp-targets is always used in conjunction with the - // option -fopenmp specifying a valid runtime with offloading support, i.e. - // libomp or libiomp. - OpenMPRuntimeKind RuntimeKind = getOpenMPRuntime(C.getInputArgs()); - if (RuntimeKind != OMPRT_OMP && RuntimeKind != OMPRT_IOMP5) { - Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets); - return; - } - - // If the user specified -fopenmp-targets= we create a toolchain for each - // valid triple. Otherwise, if only --offload-arch= was specified we instead - // attempt to derive the appropriate toolchains from the arguments. - if (Arg *OpenMPTargets = - C.getInputArgs().getLastArg(options::OPT_offload_targets_EQ)) { - if (OpenMPTargets && !OpenMPTargets->getNumValues()) { - Diag(clang::diag::warn_drv_empty_joined_argument) - << OpenMPTargets->getAsString(C.getInputArgs()); + // Get the list of requested offloading toolchains. If they were not + // explicitly specified we will infer them based on the offloading language + // and requested architectures. + std::multiset Triples; + if (C.getInputArgs().hasArg(options::OPT_offload_targets_EQ)) { + std::vector ArgValues = + C.getInputArgs().getAllArgValues(options::OPT_offload_targets_EQ); + for (llvm::StringRef Target : ArgValues) + Triples.insert(C.getInputArgs().MakeArgString(Target)); + + if (ArgValues.empty()) + Diag(clang::diag::warn_drv_empty_joined_argument) + << C.getInputArgs() + .getLastArg(options::OPT_offload_targets_EQ) + ->getAsString(C.getInputArgs()); + } else if (Kinds.size() > 0) { + for (Action::OffloadKind Kind : Kinds) { + llvm::DenseSet Derived = inferOffloadToolchains(C, Kind); + Triples.insert(Derived.begin(), Derived.end()); + } + } + + // Build an offloading toolchain for every requested target and kind. + llvm::StringMap FoundNormalizedTriples; + for (StringRef Target : Triples) { + // OpenMP offloading requires a compatible libomp. + if (Kinds.contains(Action::OFK_OpenMP)) { + OpenMPRuntimeKind RuntimeKind = getOpenMPRuntime(C.getInputArgs()); + if (RuntimeKind != OMPRT_OMP && RuntimeKind != OMPRT_IOMP5) { + Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets); return; } + } - // Make sure these show up in a deterministic order. - std::multiset OpenMPTriples; - for (StringRef T : OpenMPTargets->getValues()) - OpenMPTriples.insert(T); - - llvm::StringMap FoundNormalizedTriples; - for (StringRef T : OpenMPTriples) { - llvm::Triple TT(ToolChain::getOpenMPTriple(T)); - std::string NormalizedName = TT.normalize(); - - // Make sure we don't have a duplicate triple. - auto [TripleIt, Inserted] = - FoundNormalizedTriples.try_emplace(NormalizedName, T); - if (!Inserted) { - Diag(clang::diag::warn_drv_omp_offload_target_duplicate) - << T << TripleIt->second; - continue; - } - - // If the specified target is invalid, emit a diagnostic. 
- if (TT.getArch() == llvm::Triple::UnknownArch) { - Diag(clang::diag::err_drv_invalid_omp_target) << T; - continue; - } + // Certain options are not allowed when combined with SYCL compilation. + if (Kinds.contains(Action::OFK_SYCL)) { + for (auto ID : + {options::OPT_static_libstdcxx, options::OPT_ffreestanding}) + if (Arg *IncompatArg = C.getInputArgs().getLastArg(ID)) + Diag(clang::diag::err_drv_argument_not_allowed_with) + << IncompatArg->getSpelling() << "-fsycl"; + } - auto &TC = getOffloadToolChain(C.getInputArgs(), Action::OFK_OpenMP, TT, - C.getDefaultToolChain().getTriple()); - C.addOffloadDeviceToolChain(&TC, Action::OFK_OpenMP); - OffloadArchs[&TC] = - getOffloadArchs(C, C.getArgs(), Action::OFK_OpenMP, &TC, - /*SpecificToolchain=*/true); - } - } else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) && - ((!IsHIP && !IsCuda) || UseLLVMOffload)) { - llvm::Triple AMDTriple("amdgcn-amd-amdhsa"); - llvm::Triple NVPTXTriple("nvptx64-nvidia-cuda"); - - for (StringRef Arch : - C.getInputArgs().getAllArgValues(options::OPT_offload_arch_EQ)) { - bool IsNVPTX = IsNVIDIAOffloadArch( - StringToOffloadArch(getProcessorFromTargetID(NVPTXTriple, Arch))); - bool IsAMDGPU = IsAMDOffloadArch( - StringToOffloadArch(getProcessorFromTargetID(AMDTriple, Arch))); - if (!IsNVPTX && !IsAMDGPU && !Arch.empty() && - !Arch.equals_insensitive("native")) { - Diag(clang::diag::err_drv_failed_to_deduce_target_from_arch) << Arch; - return; - } + // Create a device toolchain for every specified kind and triple. + for (Action::OffloadKind Kind : Kinds) { + llvm::Triple TT = Kind == Action::OFK_OpenMP + ? ToolChain::getOpenMPTriple(Target) + : llvm::Triple(Target); + if (TT.getArch() == llvm::Triple::ArchType::UnknownArch) { + Diag(diag::err_drv_invalid_or_unsupported_offload_target) << TT.str(); + continue; } - // Attempt to deduce the offloading triple from the set of architectures. - // We can only correctly deduce NVPTX / AMDGPU triples currently. - for (const llvm::Triple &TT : {AMDTriple, NVPTXTriple}) { - auto &TC = getOffloadToolChain(C.getInputArgs(), Action::OFK_OpenMP, TT, - C.getDefaultToolChain().getTriple()); - - llvm::SmallVector Archs = - getOffloadArchs(C, C.getArgs(), Action::OFK_OpenMP, &TC, - /*SpecificToolchain=*/false); - if (!Archs.empty()) { - C.addOffloadDeviceToolChain(&TC, Action::OFK_OpenMP); - OffloadArchs[&TC] = Archs; - } + std::string NormalizedName = TT.normalize(); + auto [TripleIt, Inserted] = + FoundNormalizedTriples.try_emplace(NormalizedName, Target); + if (!Inserted) { + Diag(clang::diag::warn_drv_omp_offload_target_duplicate) + << Target << TripleIt->second; + continue; } - // If the set is empty then we failed to find a native architecture. - auto TCRange = C.getOffloadToolChains(Action::OFK_OpenMP); - if (TCRange.first == TCRange.second) - Diag(clang::diag::err_drv_failed_to_deduce_target_from_arch) - << "native"; - } - } else if (C.getInputArgs().hasArg(options::OPT_offload_targets_EQ)) { - Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets); - return; - } + auto &TC = getOffloadToolChain(C.getInputArgs(), Kind, TT, + C.getDefaultToolChain().getTriple()); - // We need to generate a SYCL toolchain if the user specified -fsycl. 
- bool IsSYCL = C.getInputArgs().hasFlag(options::OPT_fsycl, - options::OPT_fno_sycl, false); - - auto argSYCLIncompatible = [&](OptSpecifier OptId) { - if (!IsSYCL) - return; - if (Arg *IncompatArg = C.getInputArgs().getLastArg(OptId)) - Diag(clang::diag::err_drv_argument_not_allowed_with) - << IncompatArg->getSpelling() << "-fsycl"; - }; - // -static-libstdc++ is not compatible with -fsycl. - argSYCLIncompatible(options::OPT_static_libstdcxx); - // -ffreestanding cannot be used with -fsycl - argSYCLIncompatible(options::OPT_ffreestanding); - - llvm::SmallVector UniqueSYCLTriplesVec; - - if (IsSYCL) { - addSYCLDefaultTriple(C, UniqueSYCLTriplesVec); + // Emit a warning if the detected CUDA version is too new. + if (Kind == Action::OFK_Cuda) { + auto &CudaInstallation = + static_cast(TC).CudaInstallation; + if (CudaInstallation.isValid()) + CudaInstallation.WarnIfUnsupportedVersion(); + } - // We'll need to use the SYCL and host triples as the key into - // getOffloadingDeviceToolChain, because the device toolchains we're - // going to create will depend on both. - const ToolChain *HostTC = C.getSingleOffloadToolChain(); - for (const auto &TT : UniqueSYCLTriplesVec) { - auto &TC = getOffloadToolChain(C.getInputArgs(), Action::OFK_SYCL, TT, - HostTC->getTriple()); - C.addOffloadDeviceToolChain(&TC, Action::OFK_SYCL); - OffloadArchs[&TC] = getOffloadArchs(C, C.getArgs(), Action::OFK_SYCL, &TC, - /*SpecificToolchain=*/true); + C.addOffloadDeviceToolChain(&TC, Kind); } } - - // - // TODO: Add support for other offloading programming models here. - // } bool Driver::loadZOSCustomizationFile(llvm::cl::ExpansionContext &ExpCtx) { @@ -3306,9 +3302,6 @@ class OffloadingActionBuilder final { // architecture. If we are in host-only mode we return 'success' so that // the host uses the CUDA offload kind. if (auto *IA = dyn_cast(HostAction)) { - assert(!GpuArchList.empty() && - "We should have at least one GPU architecture."); - // If the host input is not CUDA or HIP, we don't need to bother about // this input. if (!(IA->getType() == types::TY_CUDA || @@ -3408,10 +3401,6 @@ class OffloadingActionBuilder final { CudaDeviceActions.clear(); } - /// Get canonicalized offload arch option. \returns empty StringRef if the - /// option is invalid. - virtual StringRef getCanonicalOffloadArch(StringRef Arch) = 0; - virtual std::optional> getConflictOffloadArchCombination(const std::set &GpuArchs) = 0; @@ -3440,91 +3429,25 @@ class OffloadingActionBuilder final { return true; } - ToolChains.push_back( - AssociatedOffloadKind == Action::OFK_Cuda - ? C.getSingleOffloadToolChain() - : C.getSingleOffloadToolChain()); - - CompileHostOnly = C.getDriver().offloadHostOnly(); - EmitLLVM = Args.getLastArg(options::OPT_emit_llvm); - EmitAsm = Args.getLastArg(options::OPT_S); - - // --offload and --offload-arch options are mutually exclusive. - if (Args.hasArgNoClaim(options::OPT_offload_EQ) && - Args.hasArgNoClaim(options::OPT_offload_arch_EQ, - options::OPT_no_offload_arch_EQ)) { - C.getDriver().Diag(diag::err_opt_not_valid_with_opt) << "--offload-arch" - << "--offload"; - } - - // Collect all offload arch parameters, removing duplicates. 
std::set GpuArchs; - bool Error = false; - const ToolChain &TC = *ToolChains.front(); - for (Arg *A : C.getArgsForToolChain(&TC, /*BoundArch=*/"", - AssociatedOffloadKind)) { - if (!(A->getOption().matches(options::OPT_offload_arch_EQ) || - A->getOption().matches(options::OPT_no_offload_arch_EQ))) - continue; - A->claim(); - - for (StringRef ArchStr : llvm::split(A->getValue(), ",")) { - if (A->getOption().matches(options::OPT_no_offload_arch_EQ) && - ArchStr == "all") { - GpuArchs.clear(); - } else if (ArchStr == "native") { - auto GPUsOrErr = ToolChains.front()->getSystemGPUArchs(Args); - if (!GPUsOrErr) { - TC.getDriver().Diag(diag::err_drv_undetermined_gpu_arch) - << llvm::Triple::getArchTypeName(TC.getArch()) - << llvm::toString(GPUsOrErr.takeError()) << "--offload-arch"; - continue; - } + for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_HIP}) { + for (auto &I : llvm::make_range(C.getOffloadToolChains(Kind))) { + ToolChains.push_back(I.second); - for (auto GPU : *GPUsOrErr) { - GpuArchs.insert(Args.MakeArgString(GPU)); - } - } else { - ArchStr = getCanonicalOffloadArch(ArchStr); - if (ArchStr.empty()) { - Error = true; - } else if (A->getOption().matches(options::OPT_offload_arch_EQ)) - GpuArchs.insert(ArchStr); - else if (A->getOption().matches(options::OPT_no_offload_arch_EQ)) - GpuArchs.erase(ArchStr); - else - llvm_unreachable("Unexpected option."); - } + for (auto Arch : + C.getDriver().getOffloadArchs(C, C.getArgs(), Kind, *I.second)) + GpuArchs.insert(Arch); } } - auto &&ConflictingArchs = getConflictOffloadArchCombination(GpuArchs); - if (ConflictingArchs) { - C.getDriver().Diag(clang::diag::err_drv_bad_offload_arch_combo) - << ConflictingArchs->first << ConflictingArchs->second; - C.setContainsError(); - return true; - } - - // Collect list of GPUs remaining in the set. for (auto Arch : GpuArchs) GpuArchList.push_back(Arch.data()); - // Default to sm_20 which is the lowest common denominator for - // supported GPUs. sm_20 code should work correctly, if - // suboptimally, on all newer GPUs. - if (GpuArchList.empty()) { - if (ToolChains.front()->getTriple().isSPIRV()) { - if (ToolChains.front()->getTriple().getVendor() == llvm::Triple::AMD) - GpuArchList.push_back(OffloadArch::AMDGCNSPIRV); - else - GpuArchList.push_back(OffloadArch::Generic); - } else { - GpuArchList.push_back(DefaultOffloadArch); - } - } + CompileHostOnly = C.getDriver().offloadHostOnly(); + EmitLLVM = Args.getLastArg(options::OPT_emit_llvm); + EmitAsm = Args.getLastArg(options::OPT_S); - return Error; + return false; } }; @@ -3538,15 +3461,6 @@ class OffloadingActionBuilder final { DefaultOffloadArch = OffloadArch::CudaDefault; } - StringRef getCanonicalOffloadArch(StringRef ArchStr) override { - OffloadArch Arch = StringToOffloadArch(ArchStr); - if (Arch == OffloadArch::UNKNOWN || !IsNVIDIAOffloadArch(Arch)) { - C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr; - return StringRef(); - } - return OffloadArchToString(Arch); - } - std::optional> getConflictOffloadArchCombination( const std::set &GpuArchs) override { @@ -3705,24 +3619,6 @@ class OffloadingActionBuilder final { bool canUseBundlerUnbundler() const override { return true; } - StringRef getCanonicalOffloadArch(StringRef IdStr) override { - llvm::StringMap Features; - // getHIPOffloadTargetTriple() is known to return valid value as it has - // been called successfully in the CreateOffloadingDeviceToolChains(). - auto T = - (IdStr == "amdgcnspirv") - ? 
llvm::Triple("spirv64-amd-amdhsa") - : *getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs()); - auto ArchStr = parseTargetID(T, IdStr, &Features); - if (!ArchStr) { - C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << IdStr; - C.setContainsError(); - return StringRef(); - } - auto CanId = getCanonicalTargetID(*ArchStr, Features); - return Args.MakeArgStringRef(CanId); - }; - std::optional> getConflictOffloadArchCombination( const std::set &GpuArchs) override { @@ -4715,23 +4611,20 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, static StringRef getCanonicalArchString(Compilation &C, const llvm::opt::DerivedArgList &Args, StringRef ArchStr, - const llvm::Triple &Triple, - bool SpecificToolchain) { + const llvm::Triple &Triple) { // Lookup the CUDA / HIP architecture string. Only report an error if we were // expecting the triple to be only NVPTX / AMDGPU. OffloadArch Arch = StringToOffloadArch(getProcessorFromTargetID(Triple, ArchStr)); if (Triple.isNVPTX() && (Arch == OffloadArch::UNKNOWN || !IsNVIDIAOffloadArch(Arch))) { - if (SpecificToolchain) - C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch) - << "CUDA" << ArchStr; + C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch) + << "CUDA" << ArchStr; return StringRef(); } else if (Triple.isAMDGPU() && (Arch == OffloadArch::UNKNOWN || !IsAMDOffloadArch(Arch))) { - if (SpecificToolchain) - C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch) - << "HIP" << ArchStr; + C.getDriver().Diag(clang::diag::err_drv_offload_bad_gpu_arch) + << "HIP" << ArchStr; return StringRef(); } @@ -4767,11 +4660,7 @@ getConflictOffloadArchCombination(const llvm::DenseSet &Archs, llvm::SmallVector Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args, - Action::OffloadKind Kind, const ToolChain *TC, - bool SpecificToolchain) const { - if (!TC) - TC = &C.getDefaultToolChain(); - + Action::OffloadKind Kind, const ToolChain &TC) const { // --offload and --offload-arch options are mutually exclusive. if (Args.hasArgNoClaim(options::OPT_offload_EQ) && Args.hasArgNoClaim(options::OPT_offload_arch_EQ, @@ -4784,48 +4673,44 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args, } llvm::DenseSet Archs; - for (auto *Arg : C.getArgsForToolChain(TC, /*BoundArch=*/"", Kind)) { + for (auto *Arg : C.getArgsForToolChain(&TC, /*BoundArch=*/"", Kind)) { // Add or remove the seen architectures in order of appearance. If an // invalid architecture is given we simply exit. 
if (Arg->getOption().matches(options::OPT_offload_arch_EQ)) { for (StringRef Arch : Arg->getValues()) { if (Arch == "native" || Arch.empty()) { - auto GPUsOrErr = TC->getSystemGPUArchs(Args); + auto GPUsOrErr = TC.getSystemGPUArchs(Args); if (!GPUsOrErr) { - if (!SpecificToolchain) - llvm::consumeError(GPUsOrErr.takeError()); - else - TC->getDriver().Diag(diag::err_drv_undetermined_gpu_arch) - << llvm::Triple::getArchTypeName(TC->getArch()) - << llvm::toString(GPUsOrErr.takeError()) << "--offload-arch"; + TC.getDriver().Diag(diag::err_drv_undetermined_gpu_arch) + << llvm::Triple::getArchTypeName(TC.getArch()) + << llvm::toString(GPUsOrErr.takeError()) << "--offload-arch"; continue; } for (auto ArchStr : *GPUsOrErr) { - StringRef CanonicalStr = - getCanonicalArchString(C, Args, Args.MakeArgString(ArchStr), - TC->getTriple(), SpecificToolchain); + StringRef CanonicalStr = getCanonicalArchString( + C, Args, Args.MakeArgString(ArchStr), TC.getTriple()); if (!CanonicalStr.empty()) Archs.insert(CanonicalStr); - else if (SpecificToolchain) + else return llvm::SmallVector(); } } else { - StringRef CanonicalStr = getCanonicalArchString( - C, Args, Arch, TC->getTriple(), SpecificToolchain); + StringRef CanonicalStr = + getCanonicalArchString(C, Args, Arch, TC.getTriple()); if (!CanonicalStr.empty()) Archs.insert(CanonicalStr); - else if (SpecificToolchain) + else return llvm::SmallVector(); } } } else if (Arg->getOption().matches(options::OPT_no_offload_arch_EQ)) { - for (StringRef Arch : llvm::split(Arg->getValue(), ",")) { + for (StringRef Arch : Arg->getValues()) { if (Arch == "all") { Archs.clear(); } else { - StringRef ArchStr = getCanonicalArchString( - C, Args, Arch, TC->getTriple(), SpecificToolchain); + StringRef ArchStr = + getCanonicalArchString(C, Args, Arch, TC.getTriple()); Archs.erase(ArchStr); } } @@ -4833,28 +4718,30 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args, } if (auto ConflictingArchs = - getConflictOffloadArchCombination(Archs, TC->getTriple())) + getConflictOffloadArchCombination(Archs, TC.getTriple())) C.getDriver().Diag(clang::diag::err_drv_bad_offload_arch_combo) << ConflictingArchs->first << ConflictingArchs->second; - // Skip filling defaults if we're just querying what is availible. - if (SpecificToolchain && Archs.empty()) { + // Fill in the default architectures if not provided explicitly. + if (Archs.empty()) { if (Kind == Action::OFK_Cuda) { Archs.insert(OffloadArchToString(OffloadArch::CudaDefault)); } else if (Kind == Action::OFK_HIP) { - Archs.insert(OffloadArchToString(OffloadArch::HIPDefault)); + Archs.insert(OffloadArchToString(TC.getTriple().isSPIRV() + ? OffloadArch::Generic + : OffloadArch::HIPDefault)); } else if (Kind == Action::OFK_SYCL) { Archs.insert(StringRef()); } else if (Kind == Action::OFK_OpenMP) { // Accept legacy `-march` device arguments for OpenMP. 
- if (auto *Arg = C.getArgsForToolChain(TC, /*BoundArch=*/"", Kind) + if (auto *Arg = C.getArgsForToolChain(&TC, /*BoundArch=*/"", Kind) .getLastArg(options::OPT_march_EQ)) { Archs.insert(Arg->getValue()); } else { - auto ArchsOrErr = TC->getSystemGPUArchs(Args); + auto ArchsOrErr = TC.getSystemGPUArchs(Args); if (!ArchsOrErr) { - TC->getDriver().Diag(diag::err_drv_undetermined_gpu_arch) - << llvm::Triple::getArchTypeName(TC->getArch()) + TC.getDriver().Diag(diag::err_drv_undetermined_gpu_arch) + << llvm::Triple::getArchTypeName(TC.getArch()) << llvm::toString(ArchsOrErr.takeError()) << "--offload-arch"; } else if (!ArchsOrErr->empty()) { for (auto Arch : *ArchsOrErr) @@ -4934,7 +4821,7 @@ Action *Driver::BuildOffloadingActions(Compilation &C, // Get the product of all bound architectures and toolchains. SmallVector> TCAndArchs; for (const ToolChain *TC : ToolChains) { - for (StringRef Arch : OffloadArchs.lookup(TC)) { + for (StringRef Arch : getOffloadArchs(C, C.getArgs(), Kind, *TC)) { TCAndArchs.push_back(std::make_pair(TC, Arch)); DeviceActions.push_back( C.MakeAction(*InputArg, InputType, CUID)); @@ -4966,7 +4853,7 @@ Action *Driver::BuildOffloadingActions(Compilation &C, if (Kind == Action::OFK_SYCL && Phase == phases::Assemble) continue; - auto TCAndArch = TCAndArchs.begin(); + auto *TCAndArch = TCAndArchs.begin(); for (Action *&A : DeviceActions) { if (A->getType() == types::TY_Nothing) continue; @@ -5006,7 +4893,7 @@ Action *Driver::BuildOffloadingActions(Compilation &C, A = C.MakeAction(LinkerInput, types::TY_Image); } - auto TCAndArch = TCAndArchs.begin(); + auto *TCAndArch = TCAndArchs.begin(); for (Action *A : DeviceActions) { DDeps.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind); OffloadAction::DeviceDependences DDep; @@ -5131,11 +5018,13 @@ Action *Driver::ConstructPhaseAction( if (Args.hasArg(options::OPT_extract_api)) return C.MakeAction(Input, types::TY_API_INFO); - // With 'fexperimental-modules-reduced-bmi', we don't want to run the + // With 'fmodules-reduced-bmi', we don't want to run the // precompile phase unless the user specified '--precompile'. In the case // the '--precompile' flag is enabled, we will try to emit the reduced BMI // as a by product in GenerateModuleInterfaceAction. - if (Args.hasArg(options::OPT_modules_reduced_bmi) && + if (!Args.hasArg(options::OPT_fno_modules_reduced_bmi) && + (Input->getType() == driver::types::TY_CXXModule || + Input->getType() == driver::types::TY_PP_CXXModule) && !Args.getLastArg(options::OPT__precompile)) return Input; @@ -6323,7 +6212,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, // `-fmodule-output`. 
if (!AtTopLevel && isa(JA) && JA.getType() == types::TY_ModuleFile && SpecifiedModuleOutput) { - assert(!C.getArgs().hasArg(options::OPT_modules_reduced_bmi)); + assert(C.getArgs().hasArg(options::OPT_fno_modules_reduced_bmi)); return GetModuleOutputPath(C, JA, BaseInput); } diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 3f9b808b2722e..180452077dde1 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -104,44 +104,6 @@ ToolChain::ToolChain(const Driver &D, const llvm::Triple &T, addIfExists(getFilePaths(), Path); } -llvm::Expected> -ToolChain::executeToolChainProgram(StringRef Executable) const { - llvm::SmallString<64> OutputFile; - llvm::sys::fs::createTemporaryFile("toolchain-program", "txt", OutputFile, - llvm::sys::fs::OF_Text); - llvm::FileRemover OutputRemover(OutputFile.c_str()); - std::optional Redirects[] = { - {""}, - OutputFile.str(), - {""}, - }; - - std::string ErrorMessage; - int SecondsToWait = 60; - if (std::optional Str = - llvm::sys::Process::GetEnv("CLANG_TOOLCHAIN_PROGRAM_TIMEOUT")) { - if (!llvm::to_integer(*Str, SecondsToWait)) - return llvm::createStringError(std::error_code(), - "CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected " - "an integer, got '" + - *Str + "'"); - SecondsToWait = std::max(SecondsToWait, 0); // infinite - } - if (llvm::sys::ExecuteAndWait(Executable, {Executable}, {}, Redirects, - SecondsToWait, - /*MemoryLimit=*/0, &ErrorMessage)) - return llvm::createStringError(std::error_code(), - Executable + ": " + ErrorMessage); - - llvm::ErrorOr> OutputBuf = - llvm::MemoryBuffer::getFile(OutputFile.c_str()); - if (!OutputBuf) - return llvm::createStringError(OutputBuf.getError(), - "Failed to read stdout of " + Executable + - ": " + OutputBuf.getError().message()); - return std::move(*OutputBuf); -} - void ToolChain::setTripleEnvironment(llvm::Triple::EnvironmentType Env) { Triple.setEnvironment(Env); if (EffectiveTriple != llvm::Triple()) @@ -255,13 +217,25 @@ static void getAArch64MultilibFlags(const Driver &D, Result.push_back(ABIArg->getAsString(Args)); } + if (const Arg *A = Args.getLastArg(options::OPT_O_Group); + A && A->getOption().matches(options::OPT_O)) { + switch (A->getValue()[0]) { + case 's': + Result.push_back("-Os"); + break; + case 'z': + Result.push_back("-Oz"); + break; + } + } + processMultilibCustomFlags(Result, Args); } -static void getARMMultilibFlags(const Driver &D, - const llvm::Triple &Triple, - const llvm::opt::ArgList &Args, - Multilib::flags_list &Result) { +static void getARMMultilibFlags(const Driver &D, const llvm::Triple &Triple, + llvm::Reloc::Model RelocationModel, + const llvm::opt::ArgList &Args, + Multilib::flags_list &Result) { std::vector Features; llvm::ARM::FPUKind FPUKind = tools::arm::getARMTargetFeatures( D, Triple, Args, Features, false /*ForAs*/, true /*ForMultilib*/); @@ -304,6 +278,18 @@ static void getARMMultilibFlags(const Driver &D, llvm_unreachable("Invalid float ABI"); } + if (RelocationModel == llvm::Reloc::ROPI || + RelocationModel == llvm::Reloc::ROPI_RWPI) + Result.push_back("-fropi"); + else + Result.push_back("-fno-ropi"); + + if (RelocationModel == llvm::Reloc::RWPI || + RelocationModel == llvm::Reloc::ROPI_RWPI) + Result.push_back("-frwpi"); + else + Result.push_back("-fno-rwpi"); + const Arg *BranchProtectionArg = Args.getLastArgNoClaim(options::OPT_mbranch_protection_EQ); if (BranchProtectionArg) { @@ -320,6 +306,19 @@ static void getARMMultilibFlags(const Driver &D, if (Endian->getOption().matches(options::OPT_mbig_endian)) 
Result.push_back(Endian->getAsString(Args)); } + + if (const Arg *A = Args.getLastArg(options::OPT_O_Group); + A && A->getOption().matches(options::OPT_O)) { + switch (A->getValue()[0]) { + case 's': + Result.push_back("-Os"); + break; + case 'z': + Result.push_back("-Oz"); + break; + } + } + processMultilibCustomFlags(Result, Args); } @@ -344,6 +343,18 @@ ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { const llvm::Triple Triple(ComputeEffectiveClangTriple(Args)); Result.push_back("--target=" + Triple.str()); + // A difference of relocation model (absolutely addressed data, PIC, Arm + // ROPI/RWPI) is likely to change whether a particular multilib variant is + // compatible with a given link. Determine the relocation model of the + // current link, so as to add appropriate multilib flags. + llvm::Reloc::Model RelocationModel; + unsigned PICLevel; + bool IsPIE; + { + RegisterEffectiveTriple TripleRAII(*this, Triple); + std::tie(RelocationModel, PICLevel, IsPIE) = ParsePICArgs(*this, Args); + } + switch (Triple.getArch()) { case llvm::Triple::aarch64: case llvm::Triple::aarch64_32: @@ -354,7 +365,7 @@ ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: - getARMMultilibFlags(D, Triple, Args, Result); + getARMMultilibFlags(D, Triple, RelocationModel, Args, Result); break; case llvm::Triple::riscv32: case llvm::Triple::riscv64: @@ -376,6 +387,12 @@ ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { else Result.push_back("-fexceptions"); + if (RelocationModel == llvm::Reloc::PIC_) + Result.push_back(IsPIE ? (PICLevel > 1 ? "-fPIE" : "-fpie") + : (PICLevel > 1 ? "-fPIC" : "-fpic")); + else + Result.push_back("-fno-pic"); + // Sort and remove duplicates. std::sort(Result.begin(), Result.end()); Result.erase(llvm::unique(Result), Result.end()); diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 7fc34f451f183..0cd8819a92b66 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -841,7 +841,7 @@ AMDGPUToolChain::getSystemGPUArchs(const ArgList &Args) const { else Program = GetProgramPath("amdgpu-arch"); - auto StdoutOrErr = executeToolChainProgram(Program); + auto StdoutOrErr = getDriver().executeProgram({Program}); if (!StdoutOrErr) return StdoutOrErr.takeError(); diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index 9595ee8383c85..504f110eac87c 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -130,7 +130,8 @@ std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args, return ""; } -void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args, +void sparc::getSparcTargetFeatures(const Driver &D, const llvm::Triple &Triple, + const ArgList &Args, std::vector &Features) { sparc::FloatABI FloatABI = sparc::getSparcFloatABI(D, Args); if (FloatABI == sparc::FloatABI::Soft) @@ -150,11 +151,19 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args, Features.push_back("-popc"); } + // Those OSes default to enabling VIS on 64-bit SPARC. + // See also the corresponding code for external assemblers in + // sparc::getSparcAsmModeForCPU(). 
+ bool IsSparcV9ATarget = + (Triple.getArch() == llvm::Triple::sparcv9) && + (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD()); if (Arg *A = Args.getLastArg(options::OPT_mvis, options::OPT_mno_vis)) { if (A->getOption().matches(options::OPT_mvis)) Features.push_back("+vis"); else Features.push_back("-vis"); + } else if (IsSparcV9ATarget) { + Features.push_back("+vis"); } if (Arg *A = Args.getLastArg(options::OPT_mvis2, options::OPT_mno_vis2)) { diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.h b/clang/lib/Driver/ToolChains/Arch/Sparc.h index 2b178d9df1ee3..fa25b4992cc8b 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.h +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.h @@ -31,7 +31,8 @@ FloatABI getSparcFloatABI(const Driver &D, const llvm::opt::ArgList &Args); std::string getSparcTargetCPU(const Driver &D, const llvm::opt::ArgList &Args, const llvm::Triple &Triple); -void getSparcTargetFeatures(const Driver &D, const llvm::opt::ArgList &Args, +void getSparcTargetFeatures(const Driver &D, const llvm::Triple &Triple, + const llvm::opt::ArgList &Args, std::vector &Features); const char *getSparcAsmModeForCPU(llvm::StringRef Name, const llvm::Triple &Triple); diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index e670696cd59ae..497f3330237b9 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -694,9 +694,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, NeedCRTs) CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd))); - if (TC.getTriple().isRISCV()) - CmdArgs.push_back("-X"); - // The R_ARM_TARGET2 relocation must be treated as R_ARM_REL32 on arm*-*-elf // and arm*-*-eabi (the default is R_ARM_GOT_PREL, used on arm*-*-linux and // arm*-*-*bsd). diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 456bfe885f354..7d0c142ecd061 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -97,32 +97,15 @@ forAllAssociatedToolChains(Compilation &C, const JobAction &JA, // Apply Work on all the offloading tool chains associated with the current // action. - if (JA.isHostOffloading(Action::OFK_Cuda)) - Work(*C.getSingleOffloadToolChain()); - else if (JA.isDeviceOffloading(Action::OFK_Cuda)) - Work(*C.getSingleOffloadToolChain()); - else if (JA.isHostOffloading(Action::OFK_HIP)) - Work(*C.getSingleOffloadToolChain()); - else if (JA.isDeviceOffloading(Action::OFK_HIP)) - Work(*C.getSingleOffloadToolChain()); - - if (JA.isHostOffloading(Action::OFK_OpenMP)) { - auto TCs = C.getOffloadToolChains(); - for (auto II = TCs.first, IE = TCs.second; II != IE; ++II) - Work(*II->second); - } else if (JA.isDeviceOffloading(Action::OFK_OpenMP)) - Work(*C.getSingleOffloadToolChain()); - - if (JA.isHostOffloading(Action::OFK_SYCL)) { - auto TCs = C.getOffloadToolChains(); - for (auto II = TCs.first, IE = TCs.second; II != IE; ++II) - Work(*II->second); - } else if (JA.isDeviceOffloading(Action::OFK_SYCL)) - Work(*C.getSingleOffloadToolChain()); - - // - // TODO: Add support for other offloading programming models here. 
-  //
+  for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP,
+                                   Action::OFK_HIP, Action::OFK_SYCL}) {
+    if (JA.isHostOffloading(Kind)) {
+      auto TCs = C.getOffloadToolChains(Kind);
+      for (auto II = TCs.first, IE = TCs.second; II != IE; ++II)
+        Work(*II->second);
+    } else if (JA.isDeviceOffloading(Kind))
+      Work(*C.getSingleOffloadToolChain());
+  }
 }
 
 static bool
@@ -2731,16 +2714,6 @@ static void CollectArgsForIntegratedAssembler(Compilation &C,
     CmdArgs.push_back(MipsTargetFeature);
   }
 
-  // Those OSes default to enabling VIS on 64-bit SPARC.
-  // See also the corresponding code for external assemblers in
-  // sparc::getSparcAsmModeForCPU().
-  bool IsSparcV9ATarget =
-      (C.getDefaultToolChain().getArch() == llvm::Triple::sparcv9) &&
-      (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD());
-  if (IsSparcV9ATarget && SparcTargetFeatures.empty()) {
-    CmdArgs.push_back("-target-feature");
-    CmdArgs.push_back("+vis");
-  }
   for (const char *Feature : SparcTargetFeatures) {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back(Feature);
@@ -3846,15 +3819,6 @@ static void RenderOpenACCOptions(const Driver &D, const ArgList &Args,
     return;
 
   CmdArgs.push_back("-fopenacc");
-
-  if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override)) {
-    StringRef Value = A->getValue();
-    int Version;
-    if (!Value.getAsInteger(10, Version))
-      A->renderAsInput(Args, CmdArgs);
-    else
-      D.Diag(diag::err_drv_clang_unsupported) << Value;
-  }
 }
 
 static void RenderBuiltinOptions(const ToolChain &TC, const llvm::Triple &T,
@@ -4104,31 +4068,34 @@ static bool RenderModulesOptions(Compilation &C, const Driver &D,
     // module fragment.
     CmdArgs.push_back("-fskip-odr-check-in-gmf");
 
-  if (Args.hasArg(options::OPT_modules_reduced_bmi) &&
+  if (!Args.hasArg(options::OPT_fno_modules_reduced_bmi) &&
       (Input.getType() == driver::types::TY_CXXModule ||
-       Input.getType() == driver::types::TY_PP_CXXModule)) {
+       Input.getType() == driver::types::TY_PP_CXXModule) &&
+      !Args.hasArg(options::OPT__precompile)) {
     CmdArgs.push_back("-fmodules-reduced-bmi");
 
     if (Args.hasArg(options::OPT_fmodule_output_EQ))
      Args.AddLastArg(CmdArgs, options::OPT_fmodule_output_EQ);
-    else {
-      if (Args.hasArg(options::OPT__precompile) &&
-          (!Args.hasArg(options::OPT_o) ||
-           Args.getLastArg(options::OPT_o)->getValue() ==
-               getCXX20NamedModuleOutputPath(Args, Input.getBaseInput()))) {
-        D.Diag(diag::err_drv_reduced_module_output_overrided);
-      }
-
+    else
       CmdArgs.push_back(Args.MakeArgString(
          "-fmodule-output=" +
          getCXX20NamedModuleOutputPath(Args, Input.getBaseInput())));
-    }
   }
 
-  // Noop if we see '-fmodules-reduced-bmi' with other translation
-  // units than module units. This is more user friendly to allow end uers to
-  // enable this feature without asking for help from build systems.
-  Args.ClaimAllArgs(options::OPT_modules_reduced_bmi);
+  if (Args.hasArg(options::OPT_fmodules_reduced_bmi) &&
+      Args.hasArg(options::OPT__precompile) &&
+      (!Args.hasArg(options::OPT_o) ||
+       Args.getLastArg(options::OPT_o)->getValue() ==
+           getCXX20NamedModuleOutputPath(Args, Input.getBaseInput()))) {
+    D.Diag(diag::err_drv_reduced_module_output_overrided);
+  }
+
+  // Noop if we see '-fmodules-reduced-bmi' or `-fno-modules-reduced-bmi` with
+  // other translation units than module units. This is more user friendly to
+  // allow end users to enable this feature without asking for help from build
+  // systems.
+ Args.ClaimAllArgs(options::OPT_fmodules_reduced_bmi); + Args.ClaimAllArgs(options::OPT_fno_modules_reduced_bmi); // We need to include the case the input file is a module file here. // Since the default compilation model for C++ module interface unit will @@ -5001,8 +4968,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, else { // Host-side compilation. NormalizedTriple = - (IsCuda ? C.getSingleOffloadToolChain() - : C.getSingleOffloadToolChain()) + (IsCuda ? C.getOffloadToolChains(Action::OFK_Cuda).first->second + : C.getOffloadToolChains(Action::OFK_HIP).first->second) ->getTriple() .normalize(); if (IsCuda) { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 651a39c03bb54..826e2ea7eb06d 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -856,7 +856,7 @@ void tools::getTargetFeatures(const Driver &D, const llvm::Triple &Triple, case llvm::Triple::sparc: case llvm::Triple::sparcel: case llvm::Triple::sparcv9: - sparc::getSparcTargetFeatures(D, Args, Features); + sparc::getSparcTargetFeatures(D, Triple, Args, Features); break; case llvm::Triple::r600: case llvm::Triple::amdgcn: diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 2373d945ae509..7d803beb7aa3c 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -815,7 +815,7 @@ NVPTXToolChain::getSystemGPUArchs(const ArgList &Args) const { else Program = GetProgramPath("nvptx-arch"); - auto StdoutOrErr = executeToolChainProgram(Program); + auto StdoutOrErr = getDriver().executeProgram({Program}); if (!StdoutOrErr) return StdoutOrErr.takeError(); diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 1edb83f7255eb..7ab41e9b85a04 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -447,6 +447,7 @@ void Flang::addTargetOptions(const ArgList &Args, // Add the target features. 
switch (TC.getArch()) { default: + getTargetFeatures(D, Triple, Args, CmdArgs, /*ForAs*/ false); break; case llvm::Triple::aarch64: getTargetFeatures(D, Triple, Args, CmdArgs, /*ForAs*/ false); diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index 79b1b6960da1f..8f589186af343 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -161,7 +161,7 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (Nopie || Profiling) CmdArgs.push_back("-nopie"); - if (Triple.isRISCV64()) { + if (Triple.isLoongArch64() || Triple.isRISCV64()) { CmdArgs.push_back("-X"); if (Args.hasArg(options::OPT_mno_relax)) CmdArgs.push_back("--no-relax"); diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index 791afc1a97575..51a6f6b779e77 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -220,8 +220,9 @@ DeclarationFragmentsBuilder::getFragmentsForNNS(const NestedNameSpecifier *NNS, break; case NestedNameSpecifier::Namespace: { - const NamespaceDecl *NS = NNS->getAsNamespace(); - if (NS->isAnonymousNamespace()) + const NamespaceBaseDecl *NS = NNS->getAsNamespace(); + if (const auto *Namespace = dyn_cast(NS); + Namespace && Namespace->isAnonymousNamespace()) return Fragments; SmallString<128> USR; index::generateUSRForDecl(NS, USR); @@ -230,16 +231,6 @@ DeclarationFragmentsBuilder::getFragmentsForNNS(const NestedNameSpecifier *NNS, break; } - case NestedNameSpecifier::NamespaceAlias: { - const NamespaceAliasDecl *Alias = NNS->getAsNamespaceAlias(); - SmallString<128> USR; - index::generateUSRForDecl(Alias, USR); - Fragments.append(Alias->getName(), - DeclarationFragments::FragmentKind::Identifier, USR, - Alias); - break; - } - case NestedNameSpecifier::Global: // The global specifier `::` at the beginning. No stored value. break; diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index c36cb74bc4501..29db20067c361 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -25,7 +25,7 @@ namespace clang { namespace format { -static constexpr StringRef Blanks = " \t\v\f\r"; +static constexpr StringRef Blanks(" \t\v\f\r"); static StringRef getLineCommentIndentPrefix(StringRef Comment, const FormatStyle &Style) { @@ -513,7 +513,7 @@ BreakableBlockComment::BreakableBlockComment( Decoration = ""; } for (size_t i = 1, e = Content.size(); i < e && !Decoration.empty(); ++i) { - const StringRef &Text = Content[i]; + const StringRef Text(Content[i]); if (i + 1 == e) { // If the last line is empty, the closing "*/" will have a star. 
if (Text.empty()) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 4010f7fbd25be..bf67f9e5fd241 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -560,6 +560,7 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { return true; } } else if (Current.is(TT_BinaryOperator) && Current.CanBreakBefore && + Current.getPrecedence() != prec::Assignment && CurrentState.BreakBeforeParameter) { return true; } diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 78c09be458f0a..62feb3db0ed5e 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -665,21 +665,25 @@ template <> struct MappingTraits { IO.enumCase(Value, "Never", FormatStyle::SortIncludesOptions({})); IO.enumCase(Value, "CaseInsensitive", FormatStyle::SortIncludesOptions({/*Enabled=*/true, - /*IgnoreCase=*/true})); + /*IgnoreCase=*/true, + /*IgnoreExtension=*/false})); IO.enumCase(Value, "CaseSensitive", FormatStyle::SortIncludesOptions({/*Enabled=*/true, - /*IgnoreCase=*/false})); + /*IgnoreCase=*/false, + /*IgnoreExtension=*/false})); // For backward compatibility. IO.enumCase(Value, "false", FormatStyle::SortIncludesOptions({})); IO.enumCase(Value, "true", FormatStyle::SortIncludesOptions({/*Enabled=*/true, - /*IgnoreCase=*/false})); + /*IgnoreCase=*/false, + /*IgnoreExtension=*/false})); } static void mapping(IO &IO, FormatStyle::SortIncludesOptions &Value) { IO.mapOptional("Enabled", Value.Enabled); IO.mapOptional("IgnoreCase", Value.IgnoreCase); + IO.mapOptional("IgnoreExtension", Value.IgnoreExtension); } }; @@ -1650,7 +1654,8 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.SeparateDefinitionBlocks = FormatStyle::SDS_Leave; LLVMStyle.ShortNamespaceLines = 1; LLVMStyle.SkipMacroDefinitionBody = false; - LLVMStyle.SortIncludes = {/*Enabled=*/true, /*IgnoreCase=*/false}; + LLVMStyle.SortIncludes = {/*Enabled=*/true, /*IgnoreCase=*/false, + /*IgnoreExtension=*/false}; LLVMStyle.SortJavaStaticImport = FormatStyle::SJSIO_Before; LLVMStyle.SortUsingDeclarations = FormatStyle::SUD_LexicographicNumeric; LLVMStyle.SpaceAfterCStyleCast = false; @@ -3239,19 +3244,27 @@ static void sortCppIncludes(const FormatStyle &Style, SmallVector Indices = llvm::to_vector<16>(llvm::seq(0, Includes.size())); - if (Style.SortIncludes.Enabled && Style.SortIncludes.IgnoreCase) { + if (Style.SortIncludes.Enabled) { stable_sort(Indices, [&](unsigned LHSI, unsigned RHSI) { - const auto LHSFilenameLower = Includes[LHSI].Filename.lower(); - const auto RHSFilenameLower = Includes[RHSI].Filename.lower(); - return std::tie(Includes[LHSI].Priority, LHSFilenameLower, - Includes[LHSI].Filename) < - std::tie(Includes[RHSI].Priority, RHSFilenameLower, - Includes[RHSI].Filename); - }); - } else { - stable_sort(Indices, [&](unsigned LHSI, unsigned RHSI) { - return std::tie(Includes[LHSI].Priority, Includes[LHSI].Filename) < - std::tie(Includes[RHSI].Priority, Includes[RHSI].Filename); + SmallString<128> LHSStem, RHSStem; + if (Style.SortIncludes.IgnoreExtension) { + LHSStem = Includes[LHSI].Filename; + RHSStem = Includes[RHSI].Filename; + llvm::sys::path::replace_extension(LHSStem, ""); + llvm::sys::path::replace_extension(RHSStem, ""); + } + std::string LHSStemLower, RHSStemLower; + std::string LHSFilenameLower, RHSFilenameLower; + if (Style.SortIncludes.IgnoreCase) { + LHSStemLower = LHSStem.str().lower(); + RHSStemLower = RHSStem.str().lower(); + LHSFilenameLower = 
Includes[LHSI].Filename.lower();
+        RHSFilenameLower = Includes[RHSI].Filename.lower();
+      }
+      return std::tie(Includes[LHSI].Priority, LHSStemLower, LHSStem,
+                      LHSFilenameLower, Includes[LHSI].Filename) <
+             std::tie(Includes[RHSI].Priority, RHSStemLower, RHSStem,
+                      RHSFilenameLower, Includes[RHSI].Filename);
     });
   }
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 49da3160daf50..3f4aa52a87d2e 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -1198,7 +1198,7 @@ void FormatTokenLexer::truncateToken(size_t NewLen) {
 /// Count the length of leading whitespace in a token.
 static size_t countLeadingWhitespace(StringRef Text) {
   // Basically counting the length matched by this regex.
-  // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
+  // "^([\n\r\f\v \t]|\\\\[\n\r])+"
   // Directly using the regex turned out to be slow. With the regex
   // version formatting all files in this directory took about 1.25
   // seconds. This version took about 0.5 seconds.
@@ -1222,13 +1222,6 @@ static size_t countLeadingWhitespace(StringRef Text) {
         break;
       // Splice found, consume it.
       Cur = Lookahead + 1;
-    } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
-               (Cur[3] == '\n' || Cur[3] == '\r')) {
-      // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
-      // characters are quoted individually in this comment because if we write
-      // them together some compilers warn that we have a trigraph in the code.
-      assert(End - Cur >= 4);
-      Cur += 4;
     } else {
       break;
     }
@@ -1300,22 +1293,16 @@ FormatToken *FormatTokenLexer::getNextToken() {
             Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
         break;
       case '\\':
-      case '?':
-      case '/':
-        // The text was entirely whitespace when this loop was entered. Thus
-        // this has to be an escape sequence.
-        assert(Text.substr(i, 4) == "\?\?/\r" ||
-               Text.substr(i, 4) == "\?\?/\n" ||
-               (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
-                           Text.substr(i - 1, 4) == "\?\?/\n")) ||
-               (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
-                           Text.substr(i - 2, 4) == "\?\?/\n")) ||
-               (Text[i] == '\\' && [&]() -> bool {
-                 size_t j = i + 1;
-                 while (j < Text.size() && isHorizontalWhitespace(Text[j]))
-                   ++j;
-                 return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r');
-               }()));
+        // The code preceding the loop and in the countLeadingWhitespace
+        // function guarantees that Text is entirely whitespace, not including
+        // comments but including escaped newlines. So if this character shows
+        // up, then it has to be in an escape sequence.
+ assert([&]() -> bool { + size_t j = i + 1; + while (j < Text.size() && isHorizontalWhitespace(Text[j])) + ++j; + return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r'); + }()); InEscape = true; break; default: diff --git a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp index 87823ae32b113..80487fa673bf0 100644 --- a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp +++ b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp @@ -19,7 +19,7 @@ namespace format { enum class Base { Binary, Decimal, Hex, Other }; -static Base getBase(const StringRef IntegerLiteral) { +static Base getBase(StringRef IntegerLiteral) { assert(IntegerLiteral.size() > 1); if (IntegerLiteral[0] > '0') { @@ -164,8 +164,8 @@ IntegerLiteralSeparatorFixer::process(const Environment &Env, return {Result, 0}; } -bool IntegerLiteralSeparatorFixer::checkSeparator( - const StringRef IntegerLiteral, int DigitsPerGroup) const { +bool IntegerLiteralSeparatorFixer::checkSeparator(StringRef IntegerLiteral, + int DigitsPerGroup) const { assert(DigitsPerGroup > 0); int I = 0; @@ -184,7 +184,7 @@ bool IntegerLiteralSeparatorFixer::checkSeparator( return true; } -std::string IntegerLiteralSeparatorFixer::format(const StringRef IntegerLiteral, +std::string IntegerLiteralSeparatorFixer::format(StringRef IntegerLiteral, int DigitsPerGroup, int DigitCount, bool RemoveSeparator) const { diff --git a/clang/lib/Format/IntegerLiteralSeparatorFixer.h b/clang/lib/Format/IntegerLiteralSeparatorFixer.h index 2c158e4473bfe..e24af18bb9572 100644 --- a/clang/lib/Format/IntegerLiteralSeparatorFixer.h +++ b/clang/lib/Format/IntegerLiteralSeparatorFixer.h @@ -26,8 +26,8 @@ class IntegerLiteralSeparatorFixer { const FormatStyle &Style); private: - bool checkSeparator(const StringRef IntegerLiteral, int DigitsPerGroup) const; - std::string format(const StringRef IntegerLiteral, int DigitsPerGroup, + bool checkSeparator(StringRef IntegerLiteral, int DigitsPerGroup) const; + std::string format(StringRef IntegerLiteral, int DigitsPerGroup, int DigitCount, bool RemoveSeparator) const; char Separator; diff --git a/clang/lib/Format/ObjCPropertyAttributeOrderFixer.cpp b/clang/lib/Format/ObjCPropertyAttributeOrderFixer.cpp index 37a1807197341..b885942efcb55 100644 --- a/clang/lib/Format/ObjCPropertyAttributeOrderFixer.cpp +++ b/clang/lib/Format/ObjCPropertyAttributeOrderFixer.cpp @@ -66,7 +66,7 @@ void ObjCPropertyAttributeOrderFixer::sortPropertyAttributes( return; } - const StringRef Attribute{Tok->TokenText}; + const StringRef Attribute(Tok->TokenText); StringRef Value; // Also handle `getter=getFoo` attributes. 
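The sortCppIncludes change in Format.cpp above orders includes by a tuple of (priority, stem, filename), where the stem has its extension stripped when SortIncludes.IgnoreExtension is set and both keys are lowered first when IgnoreCase is set. The sketch below approximates that comparison order using only the standard library; the Include struct, the priorities, and the file names are illustrative stand-ins, not clang-format's actual types.

// Standalone sketch of the new include-sort key:
// (Priority, stem lowered, stem, filename lowered, filename),
// assuming both IgnoreExtension and IgnoreCase are enabled.
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

struct Include {
  int Priority;         // include-category priority, as in clang-format
  std::string Filename; // e.g. "Foo.h"
};

static std::string lower(std::string S) {
  for (char &C : S)
    C = static_cast<char>(std::tolower(static_cast<unsigned char>(C)));
  return S;
}

static std::string stem(const std::string &Name) {
  // Drop the last extension, roughly what replacing the extension with "" does.
  std::size_t Dot = Name.find_last_of('.');
  std::size_t Slash = Name.find_last_of('/');
  if (Dot == std::string::npos || (Slash != std::string::npos && Dot < Slash))
    return Name;
  return Name.substr(0, Dot);
}

int main() {
  std::vector<Include> Includes = {
      {0, "foo-impl.hpp"}, {0, "Foo.h"}, {0, "foo.inc"}, {0, "bar.h"}};
  std::stable_sort(Includes.begin(), Includes.end(),
                   [](const Include &L, const Include &R) {
                     auto Key = [](const Include &I) {
                       std::string St = stem(I.Filename);
                       return std::make_tuple(I.Priority, lower(St), St,
                                              lower(I.Filename), I.Filename);
                     };
                     return Key(L) < Key(R);
                   });
  for (const Include &I : Includes)
    std::cout << I.Filename << "\n"; // bar.h, Foo.h, foo.inc, foo-impl.hpp
}

With these inputs the stem-based key keeps foo.inc next to Foo.h, whereas a plain byte-wise comparison of the full names would place foo-impl.hpp first because '-' orders before '.'; lowering the keys is what moves Foo.h after bar.h.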
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 739209a5681f8..581bfbab0972d 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2996,14 +2996,18 @@ class AnnotatingParser { const FormatToken *PrevToken = Tok.getPreviousNonComment(); if (!PrevToken) return TT_UnaryOperator; - if (PrevToken->is(TT_TypeName)) + if (PrevToken->isTypeName(LangOpts)) return TT_PointerOrReference; if (PrevToken->isPlacementOperator() && Tok.is(tok::ampamp)) return TT_BinaryOperator; - const FormatToken *NextToken = Tok.getNextNonComment(); + auto *NextToken = Tok.getNextNonComment(); if (!NextToken) return TT_PointerOrReference; + if (NextToken->is(tok::greater)) { + NextToken->setFinalizedType(TT_TemplateCloser); + return TT_PointerOrReference; + } if (InTemplateArgument && NextToken->is(tok::kw_noexcept)) return TT_BinaryOperator; @@ -3112,7 +3116,7 @@ class AnnotatingParser { // It's more likely that & represents operator& than an uninitialized // reference. - if (Tok.is(tok::amp) && PrevToken && PrevToken->Tok.isAnyIdentifier() && + if (Tok.is(tok::amp) && PrevToken->Tok.isAnyIdentifier() && IsChainedOperatorAmpOrMember(PrevToken->getPreviousNonComment()) && NextToken && NextToken->Tok.isAnyIdentifier()) { if (auto NextNext = NextToken->getNextNonComment(); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 56d10ceb986b3..3a36250da57a3 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -593,11 +593,11 @@ static bool FixupInvocation(CompilerInvocation &Invocation, CodeGenOpts.CodeModel = TargetOpts.CodeModel; CodeGenOpts.LargeDataThreshold = TargetOpts.LargeDataThreshold; - if (LangOpts.getExceptionHandling() != - LangOptions::ExceptionHandlingKind::None && + if (CodeGenOpts.getExceptionHandling() != + CodeGenOptions::ExceptionHandlingKind::None && T.isWindowsMSVCEnvironment()) Diags.Report(diag::err_fe_invalid_exception_model) - << static_cast(LangOpts.getExceptionHandling()) << T.str(); + << static_cast(CodeGenOpts.getExceptionHandling()) << T.str(); if (LangOpts.AppleKext && !LangOpts.CPlusPlus) Diags.Report(diag::warn_c_kext); @@ -3713,23 +3713,6 @@ static StringRef GetInputKindName(InputKind IK) { llvm_unreachable("unknown input language"); } -static StringRef getExceptionHandlingName(unsigned EHK) { - switch (static_cast(EHK)) { - case LangOptions::ExceptionHandlingKind::None: - return "none"; - case LangOptions::ExceptionHandlingKind::DwarfCFI: - return "dwarf"; - case LangOptions::ExceptionHandlingKind::SjLj: - return "sjlj"; - case LangOptions::ExceptionHandlingKind::WinEH: - return "seh"; - case LangOptions::ExceptionHandlingKind::Wasm: - return "wasm"; - } - - llvm_unreachable("covered switch"); -} - void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, ArgumentConsumer Consumer, const llvm::Triple &T, @@ -3745,10 +3728,6 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, GenerateArg(Consumer, OPT_pic_is_pie); for (StringRef Sanitizer : serializeSanitizerKinds(Opts.Sanitize)) GenerateArg(Consumer, OPT_fsanitize_EQ, Sanitizer); - if (Opts.ExceptionHandling) { - GenerateArg(Consumer, OPT_exception_model, - getExceptionHandlingName(Opts.ExceptionHandling)); - } return; } @@ -3934,12 +3913,8 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, if (Opts.OpenMPCUDAMode) GenerateArg(Consumer, OPT_fopenmp_cuda_mode); - if (Opts.OpenACC) { + if 
(Opts.OpenACC) GenerateArg(Consumer, OPT_fopenacc); - if (!Opts.OpenACCMacroOverride.empty()) - GenerateArg(Consumer, OPT_openacc_macro_override, - Opts.OpenACCMacroOverride); - } // The arguments used to set Optimize, OptimizeSize and NoInlineDefine are // generated from CodeGenOptions. @@ -4057,24 +4032,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, parseSanitizerKinds("-fsanitize=", Args.getAllArgValues(OPT_fsanitize_EQ), Diags, Opts.Sanitize); - if (const Arg *A = Args.getLastArg(options::OPT_exception_model)) { - std::optional EMValue = - llvm::StringSwitch>( - A->getValue()) - .Case("dwarf", LangOptions::ExceptionHandlingKind::DwarfCFI) - .Case("sjlj", LangOptions::ExceptionHandlingKind::SjLj) - .Case("seh", LangOptions::ExceptionHandlingKind::WinEH) - .Case("wasm", LangOptions::ExceptionHandlingKind::Wasm) - .Case("none", LangOptions::ExceptionHandlingKind::None) - .Default(std::nullopt); - if (EMValue) { - Opts.ExceptionHandling = static_cast(*EMValue); - } else { - Diags.Report(diag::err_drv_invalid_value) - << A->getAsString(Args) << A->getValue(); - } - } - return Diags.getNumErrors() == NumErrorsBefore; } @@ -4463,13 +4420,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Args.hasArg(options::OPT_fopenmp_cuda_mode); // OpenACC Configuration. - if (Args.hasArg(options::OPT_fopenacc)) { + if (Args.hasArg(options::OPT_fopenacc)) Opts.OpenACC = true; - if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override)) - Opts.OpenACCMacroOverride = A->getValue(); - } - if (Arg *A = Args.getLastArg(OPT_ffp_contract)) { StringRef Val = A->getValue(); if (Val == "fast") diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 136bc55847cc1..382ccd610946c 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -639,16 +639,8 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, } } - if (LangOpts.OpenACC) { - // FIXME: When we have full support for OpenACC, we should set this to the - // version we support. Until then, set as '1' by default, but provide a - // temporary mechanism for users to override this so real-world examples can - // be tested against. 
- if (!LangOpts.OpenACCMacroOverride.empty()) - Builder.defineMacro("_OPENACC", LangOpts.OpenACCMacroOverride); - else - Builder.defineMacro("_OPENACC", "1"); - } + if (LangOpts.OpenACC) + Builder.defineMacro("_OPENACC", "202506"); } /// Initialize the predefined C++ language feature test macros defined in @@ -1032,14 +1024,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (LangOpts.GNUCVersion && LangOpts.RTTI) Builder.defineMacro("__GXX_RTTI"); - if (LangOpts.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) Builder.defineMacro("__USING_SJLJ_EXCEPTIONS__"); - else if (LangOpts.hasSEHExceptions()) + else if (CGOpts.hasSEHExceptions()) Builder.defineMacro("__SEH__"); - else if (LangOpts.hasDWARFExceptions() && + else if (CGOpts.hasDWARFExceptions() && (TI.getTriple().isThumb() || TI.getTriple().isARM())) Builder.defineMacro("__ARM_DWARF_EH__"); - else if (LangOpts.hasWasmExceptions() && TI.getTriple().isWasm()) + else if (CGOpts.hasWasmExceptions() && TI.getTriple().isWasm()) Builder.defineMacro("__WASM_EXCEPTIONS__"); if (LangOpts.Deprecated) diff --git a/clang/lib/Headers/__clang_spirv_builtins.h b/clang/lib/Headers/__clang_spirv_builtins.h index 9915cdfcae7cd..9c7215f506508 100644 --- a/clang/lib/Headers/__clang_spirv_builtins.h +++ b/clang/lib/Headers/__clang_spirv_builtins.h @@ -52,30 +52,30 @@ // Builtin IDs and sizes extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_num_workgroups) __size_t - __spirv_NumWorkgroups(int); + __spirv_BuiltInNumWorkgroups(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_workgroup_size) __size_t - __spirv_WorkgroupSize(int); + __spirv_BuiltInWorkgroupSize(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_workgroup_id) __size_t - __spirv_WorkgroupId(int); + __spirv_BuiltInWorkgroupId(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_local_invocation_id) __size_t - __spirv_LocalInvocationId(int); + __spirv_BuiltInLocalInvocationId(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_invocation_id) __size_t - __spirv_GlobalInvocationId(int); + __spirv_BuiltInGlobalInvocationId(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_size) __size_t - __spirv_GlobalSize(int); + __spirv_BuiltInGlobalSize(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_offset) __size_t - __spirv_GlobalOffset(int); + __spirv_BuiltInGlobalOffset(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_size) __uint32_t - __spirv_SubgroupSize(); + __spirv_BuiltInSubgroupSize(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_max_size) __uint32_t - __spirv_SubgroupMaxSize(); + __spirv_BuiltInSubgroupMaxSize(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_num_subgroups) __uint32_t - __spirv_NumSubgroups(); + __spirv_BuiltInNumSubgroups(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_id) __uint32_t - __spirv_SubgroupId(); + __spirv_BuiltInSubgroupId(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_local_invocation_id) - __uint32_t __spirv_SubgroupLocalInvocationId(); + __uint32_t __spirv_BuiltInSubgroupLocalInvocationId(); // OpGenericCastToPtrExplicit diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h index 4eb7b8f45c85a..e8ccccb489815 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h @@ -71,6 +71,16 @@ constexpr vector reflect_vec_impl(vector I, vector N) { #endif } +template constexpr T refract_impl(T I, T N, U Eta) { +#if (__has_builtin(__builtin_spirv_refract)) + return 
__builtin_spirv_refract(I, N, Eta); +#endif + T Mul = dot(N, I); + T K = 1 - Eta * Eta * (1 - Mul * Mul); + T Result = (Eta * I - (Eta * Mul + sqrt(K)) * N); + return select(K < 0, static_cast<T>(0), Result); +} + template <typename T> constexpr T fmod_impl(T X, T Y) { #if !defined(__DIRECTX__) return __builtin_elementwise_fmod(X, Y); diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index ea880105fac3b..499a05328ee4f 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -475,6 +475,65 @@ reflect(__detail::HLSL_FIXED_VECTOR I, return __detail::reflect_vec_impl(I, N); } +//===----------------------------------------------------------------------===// +// refract builtin +//===----------------------------------------------------------------------===// + +/// \fn T refract(T I, T N, T eta) +/// \brief Returns a refraction using an entering ray, \a I, a surface +/// normal, \a N, and a refraction index, \a eta. +/// \param I The entering ray. +/// \param N The surface normal. +/// \param eta The refraction index. +/// +/// The return value is a floating-point vector that represents the refraction +/// using the refraction index, \a eta, for the direction of the entering ray, +/// \a I, off a surface with the normal \a N. +/// +/// This function calculates the refraction vector using the following formulas: +/// k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I)) +/// if k < 0.0 the result is 0.0 +/// otherwise, the result is eta * I - (eta * dot(N, I) + sqrt(k)) * N +/// +/// I and N must already be normalized in order to achieve the desired result. +/// +/// I and N must be a scalar or vector whose component type is +/// floating-point. +/// +/// eta must be a 16-bit or 32-bit floating-point scalar. +/// +/// Result type, the type of I, and the type of N must all be the same type.
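For illustration only (not part of the patch), the formula documented above can be exercised as a standalone scalar sketch; plain C++ with <cmath>, and the helper name refract_scalar is invented for this example:

#include <cmath>

// Scalar model of the documented refract() formula:
//   k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I))
//   result = k < 0 ? 0 : eta * I - (eta * dot(N, I) + sqrt(k)) * N
float refract_scalar(float I, float N, float eta) {
  float d = N * I;                               // dot(N, I) reduces to a product for scalars
  float k = 1.0f - eta * eta * (1.0f - d * d);
  if (k < 0.0f)
    return 0.0f;                                 // total internal reflection: the result is 0
  return eta * I - (eta * d + std::sqrt(k)) * N;
}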
+ +template +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline __detail::enable_if_t<__detail::is_arithmetic::Value && + __detail::is_same::value, + T> refract(T I, T N, T eta) { + return __detail::refract_impl(I, N, eta); +} + +template +const inline __detail::enable_if_t< + __detail::is_arithmetic::Value && __detail::is_same::value, T> +refract(T I, T N, T eta) { + return __detail::refract_impl(I, N, eta); +} + +template +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline __detail::HLSL_FIXED_VECTOR refract( + __detail::HLSL_FIXED_VECTOR I, + __detail::HLSL_FIXED_VECTOR N, half eta) { + return __detail::refract_impl(I, N, eta); +} + +template +const inline __detail::HLSL_FIXED_VECTOR +refract(__detail::HLSL_FIXED_VECTOR I, + __detail::HLSL_FIXED_VECTOR N, float eta) { + return __detail::refract_impl(I, N, eta); +} + //===----------------------------------------------------------------------===// // smoothstep builtin //===----------------------------------------------------------------------===// diff --git a/clang/lib/Index/IndexTypeSourceInfo.cpp b/clang/lib/Index/IndexTypeSourceInfo.cpp index 98b5513128fbe..adc33b3abd822 100644 --- a/clang/lib/Index/IndexTypeSourceInfo.cpp +++ b/clang/lib/Index/IndexTypeSourceInfo.cpp @@ -271,10 +271,6 @@ void IndexingContext::indexNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS, handleReference(NNS.getNestedNameSpecifier()->getAsNamespace(), Loc, Parent, DC, SymbolRoleSet()); break; - case NestedNameSpecifier::NamespaceAlias: - handleReference(NNS.getNestedNameSpecifier()->getAsNamespaceAlias(), - Loc, Parent, DC, SymbolRoleSet()); - break; case NestedNameSpecifier::TypeSpec: indexTypeLoc(NNS.getTypeLoc(), Parent, DC); diff --git a/clang/lib/Interpreter/CMakeLists.txt b/clang/lib/Interpreter/CMakeLists.txt index 38cf139fa86a6..70de4a2aaa541 100644 --- a/clang/lib/Interpreter/CMakeLists.txt +++ b/clang/lib/Interpreter/CMakeLists.txt @@ -29,6 +29,7 @@ add_clang_library(clangInterpreter InterpreterUtils.cpp RemoteJITUtils.cpp Value.cpp + InterpreterValuePrinter.cpp ${WASM_SRC} PARTIAL_SOURCES_INTENDED diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index ed3bae59a144c..db6a2bb914f43 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -264,7 +264,7 @@ class InProcessPrintingASTConsumer final : public MultiplexConsumer { if (auto *TLSD = llvm::dyn_cast(D)) if (TLSD && TLSD->isSemiMissing()) { auto ExprOrErr = - Interp.ExtractValueFromExpr(cast(TLSD->getStmt())); + Interp.convertExprToValue(cast(TLSD->getStmt())); if (llvm::Error E = ExprOrErr.takeError()) { llvm::logAllUnhandledErrors(std::move(E), llvm::errs(), "Value printing failed: "); @@ -440,11 +440,10 @@ const char *const Runtimes = R"( #define __CLANG_REPL__ 1 #ifdef __cplusplus #define EXTERN_C extern "C" - void *__clang_Interpreter_SetValueWithAlloc(void*, void*, void*); struct __clang_Interpreter_NewTag{} __ci_newtag; void* operator new(__SIZE_TYPE__, void* __p, __clang_Interpreter_NewTag) noexcept; template - void __clang_Interpreter_SetValueCopyArr(T* Src, void* Placement, unsigned long Size) { + void __clang_Interpreter_SetValueCopyArr(const T* Src, void* Placement, unsigned long Size) { for (auto Idx = 0; Idx < Size; ++Idx) new ((void*)(((T*)Placement) + Idx), __ci_newtag) T(Src[Idx]); } @@ -454,8 +453,12 @@ const char *const Runtimes = R"( } #else #define EXTERN_C extern + EXTERN_C void *memcpy(void *restrict dst, const void *restrict src, __SIZE_TYPE__ n); + EXTERN_C inline void 
__clang_Interpreter_SetValueCopyArr(const void* Src, void* Placement, unsigned long Size) { + memcpy(Placement, Src, Size); + } #endif // __cplusplus - + EXTERN_C void *__clang_Interpreter_SetValueWithAlloc(void*, void*, void*); EXTERN_C void __clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, ...); )"; @@ -470,12 +473,12 @@ Interpreter::create(std::unique_ptr CI, // Add runtime code and set a marker to hide it from user code. Undo will not // go through that. - auto PTU = Interp->Parse(Runtimes); - if (!PTU) - return PTU.takeError(); + Err = Interp->ParseAndExecute(Runtimes); + if (Err) + return std::move(Err); + Interp->markUserCodeStart(); - Interp->ValuePrintingInfo.resize(4); return std::move(Interp); } @@ -524,12 +527,11 @@ Interpreter::createWithCUDA(std::unique_ptr CI, return std::move(Interp); } +CompilerInstance *Interpreter::getCompilerInstance() { return CI.get(); } const CompilerInstance *Interpreter::getCompilerInstance() const { - return CI.get(); + return const_cast(this)->getCompilerInstance(); } -CompilerInstance *Interpreter::getCompilerInstance() { return CI.get(); } - llvm::Expected Interpreter::getExecutionEngine() { if (!IncrExecutor) { if (auto Err = CreateExecutor()) @@ -610,7 +612,14 @@ Interpreter::Parse(llvm::StringRef Code) { if (!TuOrErr) return TuOrErr.takeError(); - return RegisterPTU(*TuOrErr); + PTUs.emplace_back(PartialTranslationUnit()); + PartialTranslationUnit &LastPTU = PTUs.back(); + LastPTU.TUPart = *TuOrErr; + + if (std::unique_ptr M = GenModule()) + LastPTU.TheModule = std::move(M); + + return LastPTU; } static llvm::Expected @@ -808,10 +817,10 @@ Interpreter::GenModule(IncrementalAction *Action) { // sure it always stays empty. assert(((!CachedInCodeGenModule || !getCompilerInstance()->getPreprocessorOpts().Includes.empty()) || - (CachedInCodeGenModule->empty() && - CachedInCodeGenModule->global_empty() && - CachedInCodeGenModule->alias_empty() && - CachedInCodeGenModule->ifunc_empty())) && + ((CachedInCodeGenModule->empty() && + CachedInCodeGenModule->global_empty() && + CachedInCodeGenModule->alias_empty() && + CachedInCodeGenModule->ifunc_empty()))) && "CodeGen wrote to a readonly module"); std::unique_ptr M(CG->ReleaseModule()); CG->StartModule("incr_module_" + std::to_string(ID++), M->getContext()); @@ -828,4 +837,4 @@ CodeGenerator *Interpreter::getCodeGen(IncrementalAction *Action) const { return nullptr; return static_cast(WrappedAct)->getCodeGenerator(); } -} // namespace clang +} // end namespace clang diff --git a/clang/lib/Interpreter/InterpreterUtils.cpp b/clang/lib/Interpreter/InterpreterUtils.cpp index 45f6322b8461e..a19f96c80b94f 100644 --- a/clang/lib/Interpreter/InterpreterUtils.cpp +++ b/clang/lib/Interpreter/InterpreterUtils.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "InterpreterUtils.h" +#include "clang/AST/QualTypeNames.h" namespace clang { @@ -81,7 +82,7 @@ NamedDecl *LookupNamed(Sema &S, llvm::StringRef Name, else { const DeclContext *PrimaryWithin = nullptr; if (const auto *TD = dyn_cast(Within)) - PrimaryWithin = llvm::dyn_cast_or_null(TD->getDefinition()); + PrimaryWithin = dyn_cast_if_present(TD->getDefinition()); else PrimaryWithin = Within->getPrimaryContext(); @@ -97,15 +98,16 @@ NamedDecl *LookupNamed(Sema &S, llvm::StringRef Name, R.resolveKind(); if (R.isSingleResult()) - return llvm::dyn_cast(R.getFoundDecl()); + return dyn_cast(R.getFoundDecl()); return nullptr; } std::string GetFullTypeName(ASTContext &Ctx, 
QualType QT) { + QualType FQT = TypeName::getFullyQualifiedType(QT, Ctx); PrintingPolicy Policy(Ctx.getPrintingPolicy()); Policy.SuppressScope = false; Policy.AnonymousTagLocations = false; - return QT.getAsString(Policy); + return FQT.getAsString(Policy); } } // namespace clang diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h index c7b405b486d93..fbf9814b0d4a7 100644 --- a/clang/lib/Interpreter/InterpreterUtils.h +++ b/clang/lib/Interpreter/InterpreterUtils.h @@ -45,7 +45,7 @@ NamespaceDecl *LookupNamespace(Sema &S, llvm::StringRef Name, const DeclContext *Within = nullptr); NamedDecl *LookupNamed(Sema &S, llvm::StringRef Name, - const DeclContext *Within); + const DeclContext *Within = nullptr); std::string GetFullTypeName(ASTContext &Ctx, QualType QT); } // namespace clang diff --git a/clang/lib/Interpreter/InterpreterValuePrinter.cpp b/clang/lib/Interpreter/InterpreterValuePrinter.cpp index 3e7e32b2e8557..0ea6274b79cba 100644 --- a/clang/lib/Interpreter/InterpreterValuePrinter.cpp +++ b/clang/lib/Interpreter/InterpreterValuePrinter.cpp @@ -18,6 +18,7 @@ #include "clang/Frontend/CompilerInstance.h" #include "clang/Interpreter/Interpreter.h" #include "clang/Interpreter/Value.h" +#include "clang/Lex/Preprocessor.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Sema.h" @@ -25,13 +26,335 @@ #include "llvm/Support/raw_ostream.h" #include - +#include #include +#include +#include + +#define DEBUG_TYPE "interp-value" + +using namespace clang; + +static std::string DeclTypeToString(const QualType &QT, NamedDecl *D) { + std::string Str; + llvm::raw_string_ostream SS(Str); + if (QT.hasQualifiers()) + SS << QT.getQualifiers().getAsString() << " "; + SS << D->getQualifiedNameAsString(); + return Str; +} + +static std::string QualTypeToString(ASTContext &Ctx, QualType QT) { + PrintingPolicy Policy(Ctx.getPrintingPolicy()); + // Print the Allocator in STL containers, for instance. + Policy.SuppressDefaultTemplateArgs = false; + Policy.SuppressUnwrittenScope = true; + // Print 'a<b<c> >' rather than 'a<b<c>>'. + Policy.SplitTemplateClosers = true; + + struct LocalPrintingPolicyRAII { + ASTContext &Context; + PrintingPolicy Policy; + + LocalPrintingPolicyRAII(ASTContext &Ctx, PrintingPolicy &PP) + : Context(Ctx), Policy(Ctx.getPrintingPolicy()) { + Context.setPrintingPolicy(PP); + } + ~LocalPrintingPolicyRAII() { Context.setPrintingPolicy(Policy); } + } X(Ctx, Policy); + + const QualType NonRefTy = QT.getNonReferenceType(); + + if (const auto *TTy = llvm::dyn_cast(NonRefTy)) + return DeclTypeToString(NonRefTy, TTy->getDecl()); + + if (const auto *TRy = dyn_cast(NonRefTy)) + return DeclTypeToString(NonRefTy, TRy->getDecl()); + + const QualType Canon = NonRefTy.getCanonicalType(); + + // FIXME: How can a builtin type be a function pointer type? + if (Canon->isBuiltinType() && !NonRefTy->isFunctionPointerType() && + !NonRefTy->isMemberPointerType()) + return Canon.getAsString(Ctx.getPrintingPolicy()); + + if (const auto *TDTy = dyn_cast(NonRefTy)) { + // FIXME: TemplateSpecializationType & SubstTemplateTypeParmType checks + // are predominantly to get STL containers to print nicer and might be + // better handled in GetFullyQualifiedName.
+ // + // std::vector::iterator is a TemplateSpecializationType + // std::vector::value_type is a SubstTemplateTypeParmType + // + QualType SSDesugar = TDTy->getLocallyUnqualifiedSingleStepDesugaredType(); + if (llvm::isa(SSDesugar)) + return GetFullTypeName(Ctx, Canon); + else if (llvm::isa(SSDesugar)) + return GetFullTypeName(Ctx, NonRefTy); + return DeclTypeToString(NonRefTy, TDTy->getDecl()); + } + return GetFullTypeName(Ctx, NonRefTy); +} + +static std::string EnumToString(const Value &V) { + std::string Str; + llvm::raw_string_ostream SS(Str); + ASTContext &Ctx = const_cast(V.getASTContext()); + + QualType DesugaredTy = V.getType().getDesugaredType(Ctx); + const EnumType *EnumTy = DesugaredTy.getNonReferenceType()->getAs(); + assert(EnumTy && "Fail to cast to enum type"); + + EnumDecl *ED = EnumTy->getDecl(); + uint64_t Data = V.convertTo(); + bool IsFirst = true; + llvm::APSInt AP = Ctx.MakeIntValue(Data, DesugaredTy); + + for (auto I = ED->enumerator_begin(), E = ED->enumerator_end(); I != E; ++I) { + if (I->getInitVal() == AP) { + if (!IsFirst) + SS << " ? "; + SS << "(" + I->getQualifiedNameAsString() << ")"; + IsFirst = false; + } + } + llvm::SmallString<64> APStr; + AP.toString(APStr, /*Radix=*/10); + SS << " : " << QualTypeToString(Ctx, ED->getIntegerType()) << " " << APStr; + return Str; +} + +static std::string FunctionToString(const Value &V, const void *Ptr) { + std::string Str; + llvm::raw_string_ostream SS(Str); + SS << "Function @" << Ptr; + + const DeclContext *PTU = V.getASTContext().getTranslationUnitDecl(); + // Find the last top-level-stmt-decl. This is a forward iterator but the + // partial translation unit should not be large. + const TopLevelStmtDecl *TLSD = nullptr; + for (const Decl *D : PTU->noload_decls()) + if (isa(D)) + TLSD = cast(D); + + // Get __clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void + // *OpaqueType, void *Val); + const FunctionDecl *FD = nullptr; + if (auto *InterfaceCall = llvm::dyn_cast(TLSD->getStmt())) { + const auto *Arg = InterfaceCall->getArg(/*Val*/ 3); + // Get rid of cast nodes. + while (const CastExpr *CastE = llvm::dyn_cast(Arg)) + Arg = CastE->getSubExpr(); + if (const DeclRefExpr *DeclRefExp = llvm::dyn_cast(Arg)) + FD = llvm::dyn_cast(DeclRefExp->getDecl()); + + if (FD) { + SS << '\n'; + const clang::FunctionDecl *FDef; + if (FD->hasBody(FDef)) + FDef->print(SS); + } + } + return Str; +} + +static std::string VoidPtrToString(const void *Ptr) { + std::string Str; + llvm::raw_string_ostream SS(Str); + SS << Ptr; + return Str; +} + +static std::string CharPtrToString(const char *Ptr) { + if (!Ptr) + return "0"; + + std::string Result = "\""; + Result += Ptr; + Result += '"'; + return Result; +} namespace clang { +struct ValueRef : public Value { + ValueRef(const Interpreter *In, void *Ty) : Value(In, Ty) { + // Tell the base class to not try to deallocate if it manages the value. + IsManuallyAlloc = false; + } +}; + +std::string Interpreter::ValueDataToString(const Value &V) const { + Sema &S = getCompilerInstance()->getSema(); + ASTContext &Ctx = S.getASTContext(); + + QualType QT = V.getType(); + + if (const ConstantArrayType *CAT = Ctx.getAsConstantArrayType(QT)) { + QualType ElemTy = CAT->getElementType(); + size_t ElemCount = Ctx.getConstantArrayElementCount(CAT); + const Type *BaseTy = CAT->getBaseElementTypeUnsafe(); + size_t ElemSize = Ctx.getTypeSizeInChars(BaseTy).getQuantity(); + + // Treat null terminated char arrays as strings basically. 
+ if (ElemTy->isCharType()) { + char last = *(char *)(((uintptr_t)V.getPtr()) + ElemCount * ElemSize - 1); + if (last == '\0') + return CharPtrToString((char *)V.getPtr()); + } + + std::string Result = "{ "; + for (unsigned Idx = 0, N = CAT->getZExtSize(); Idx < N; ++Idx) { + ValueRef InnerV = ValueRef(this, ElemTy.getAsOpaquePtr()); + if (ElemTy->isBuiltinType()) { + // Single dim arrays, advancing. + uintptr_t Offset = (uintptr_t)V.getPtr() + Idx * ElemSize; + InnerV.setRawBits((void *)Offset, ElemSize * 8); + } else { + // Multi dim arrays, position to the next dimension. + size_t Stride = ElemCount / N; + uintptr_t Offset = ((uintptr_t)V.getPtr()) + Idx * Stride * ElemSize; + InnerV.setPtr((void *)Offset); + } + + Result += ValueDataToString(InnerV); + + // Skip the \0 if the char types + if (Idx < N - 1) + Result += ", "; + } + Result += " }"; + return Result; + } + + QualType DesugaredTy = QT.getDesugaredType(Ctx); + QualType NonRefTy = DesugaredTy.getNonReferenceType(); + + // FIXME: Add support for user defined printers. + // LookupResult R = LookupUserDefined(S, QT); + // if (!R.empty()) + // return CallUserSpecifiedPrinter(R, V); + + // If it is a builtin type dispatch to the builtin overloads. + if (auto *BT = DesugaredTy.getCanonicalType()->getAs()) { + + auto formatFloating = [](auto Val, char Suffix = '\0') -> std::string { + std::string Out; + llvm::raw_string_ostream SS(Out); + + if (std::isnan(Val) || std::isinf(Val)) { + SS << llvm::format("%g", Val); + return SS.str(); + } + if (Val == static_cast(static_cast(Val))) + SS << llvm::format("%.1f", Val); + else if (std::abs(Val) < 1e-4 || std::abs(Val) > 1e6 || Suffix == 'f') + SS << llvm::format("%#.6g", Val); + else if (Suffix == 'L') + SS << llvm::format("%#.12Lg", Val); + else + SS << llvm::format("%#.8g", Val); + + if (Suffix != '\0') + SS << Suffix; + return SS.str(); + }; + + std::string Str; + llvm::raw_string_ostream SS(Str); + switch (BT->getKind()) { + default: + return "{ error: unknown builtin type '" + std::to_string(BT->getKind()) + + " '}"; + case clang::BuiltinType::Bool: + SS << ((V.getBool()) ? 
"true" : "false"); + return Str; + case clang::BuiltinType::Char_S: + SS << '\'' << V.getChar_S() << '\''; + return Str; + case clang::BuiltinType::SChar: + SS << '\'' << V.getSChar() << '\''; + return Str; + case clang::BuiltinType::Char_U: + SS << '\'' << V.getChar_U() << '\''; + return Str; + case clang::BuiltinType::UChar: + SS << '\'' << V.getUChar() << '\''; + return Str; + case clang::BuiltinType::Short: + SS << V.getShort(); + return Str; + case clang::BuiltinType::UShort: + SS << V.getUShort(); + return Str; + case clang::BuiltinType::Int: + SS << V.getInt(); + return Str; + case clang::BuiltinType::UInt: + SS << V.getUInt(); + return Str; + case clang::BuiltinType::Long: + SS << V.getLong(); + return Str; + case clang::BuiltinType::ULong: + SS << V.getULong(); + return Str; + case clang::BuiltinType::LongLong: + SS << V.getLongLong(); + return Str; + case clang::BuiltinType::ULongLong: + SS << V.getULongLong(); + return Str; + case clang::BuiltinType::Float: + return formatFloating(V.getFloat(), /*suffix=*/'f'); + + case clang::BuiltinType::Double: + return formatFloating(V.getDouble()); + + case clang::BuiltinType::LongDouble: + return formatFloating(V.getLongDouble(), /*suffix=*/'L'); + } + } + + if ((NonRefTy->isPointerType() || NonRefTy->isMemberPointerType()) && + NonRefTy->getPointeeType()->isFunctionProtoType()) + return FunctionToString(V, V.getPtr()); + + if (NonRefTy->isFunctionType()) + return FunctionToString(V, &V); + + if (NonRefTy->isEnumeralType()) + return EnumToString(V); + + if (NonRefTy->isNullPtrType()) + return "nullptr\n"; + + // FIXME: Add support for custom printers in C. + if (NonRefTy->isPointerType()) { + if (NonRefTy->getPointeeType()->isCharType()) + return CharPtrToString((char *)V.getPtr()); + + return VoidPtrToString(V.getPtr()); + } + + // Fall back to printing just the address of the unknown object. + return "@" + VoidPtrToString(V.getPtr()); +} + +std::string Interpreter::ValueTypeToString(const Value &V) const { + ASTContext &Ctx = const_cast(V.getASTContext()); + QualType QT = V.getType(); + + std::string QTStr = QualTypeToString(Ctx, QT); + + if (QT->isReferenceType()) + QTStr += " &"; + + return QTStr; +} + llvm::Expected -Interpreter::CompileDtorCall(CXXRecordDecl *CXXRD) { +Interpreter::CompileDtorCall(CXXRecordDecl *CXXRD) const { assert(CXXRD && "Cannot compile a destructor for a nullptr"); if (auto Dtor = Dtors.find(CXXRD); Dtor != Dtors.end()) return Dtor->getSecond(); @@ -81,7 +404,7 @@ class InterfaceKindVisitor return InterfaceKind::CopyArray; } - InterfaceKind VisitFunctionProtoType(const FunctionProtoType *Ty) { + InterfaceKind VisitFunctionType(const FunctionType *Ty) { HandlePtrType(Ty); return InterfaceKind::NoAlloc; } @@ -141,9 +464,14 @@ class InterfaceKindVisitor } }; +static constexpr llvm::StringRef VPName[] = { + "__clang_Interpreter_SetValueNoAlloc", + "__clang_Interpreter_SetValueWithAlloc", + "__clang_Interpreter_SetValueCopyArr", "__ci_newtag"}; + // This synthesizes a call expression to a speciall // function that is responsible for generating the Value. -// In general, we transform: +// In general, we transform c++: // clang-repl> x // To: // // 1. If x is a built-in type like int, float. @@ -154,7 +482,7 @@ class InterfaceKindVisitor // // 3. If x is a struct, but a rvalue. 
// new (__clang_Interpreter_SetValueWithAlloc(ThisInterp, OpaqueValue, // xQualType)) (x); -llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { +llvm::Expected Interpreter::convertExprToValue(Expr *E) { Sema &S = getCompilerInstance()->getSema(); ASTContext &Ctx = S.getASTContext(); @@ -176,23 +504,21 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { Interface = S.BuildDeclarationNameExpr(CSS, R, /*ADL=*/false).get(); return llvm::Error::success(); }; - static constexpr llvm::StringRef Builtin[] = { - "__clang_Interpreter_SetValueNoAlloc", - "__clang_Interpreter_SetValueWithAlloc", - "__clang_Interpreter_SetValueCopyArr", "__ci_newtag"}; if (llvm::Error Err = - LookupInterface(ValuePrintingInfo[NoAlloc], Builtin[NoAlloc])) + LookupInterface(ValuePrintingInfo[NoAlloc], VPName[NoAlloc])) + return std::move(Err); + + if (llvm::Error Err = + LookupInterface(ValuePrintingInfo[CopyArray], VPName[CopyArray])) + return std::move(Err); + + if (llvm::Error Err = + LookupInterface(ValuePrintingInfo[WithAlloc], VPName[WithAlloc])) return std::move(Err); if (Ctx.getLangOpts().CPlusPlus) { if (llvm::Error Err = - LookupInterface(ValuePrintingInfo[WithAlloc], Builtin[WithAlloc])) - return std::move(Err); - if (llvm::Error Err = - LookupInterface(ValuePrintingInfo[CopyArray], Builtin[CopyArray])) - return std::move(Err); - if (llvm::Error Err = - LookupInterface(ValuePrintingInfo[NewTag], Builtin[NewTag])) + LookupInterface(ValuePrintingInfo[NewTag], VPName[NewTag])) return std::move(Err); } } @@ -211,7 +537,7 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { if (auto *EWC = llvm::dyn_cast_if_present(E)) E = EWC->getSubExpr(); - QualType Ty = E->getType(); + QualType Ty = E->IgnoreImpCasts()->getType(); QualType DesugaredTy = Ty.getDesugaredType(Ctx); // For lvalue struct, we treat it as a reference. @@ -239,7 +565,10 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { ExprResult AllocCall = S.ActOnCallExpr(Scope, ValuePrintingInfo[InterfaceKind::WithAlloc], E->getBeginLoc(), AdjustedArgs, E->getEndLoc()); - assert(!AllocCall.isInvalid() && "Can't create runtime interface call!"); + if (AllocCall.isInvalid()) + return llvm::make_error( + "Cannot call to " + VPName[WithAlloc], + llvm::inconvertibleErrorCode()); TypeSourceInfo *TSI = Ctx.getTrivialTypeSourceInfo(Ty, SourceLocation()); @@ -253,14 +582,23 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { // __clang_Interpreter_SetValueCopyArr. 
if (Kind == InterfaceKind::CopyArray) { - const auto *ConstantArrTy = - cast(DesugaredTy.getTypePtr()); - size_t ArrSize = Ctx.getConstantArrayElementCount(ConstantArrTy); + const auto *CATy = cast(DesugaredTy.getTypePtr()); + size_t ArrSize = Ctx.getConstantArrayElementCount(CATy); + + if (!Ctx.getLangOpts().CPlusPlus) + ArrSize *= Ctx.getTypeSizeInChars(CATy->getBaseElementTypeUnsafe()) + .getQuantity(); + Expr *ArrSizeExpr = IntegerLiteralExpr(Ctx, ArrSize); Expr *Args[] = {E, AllocCall.get(), ArrSizeExpr}; SetValueE = S.ActOnCallExpr(Scope, ValuePrintingInfo[InterfaceKind::CopyArray], SourceLocation(), Args, SourceLocation()); + if (SetValueE.isInvalid()) + return llvm::make_error( + "Cannot call to " + VPName[CopyArray], + llvm::inconvertibleErrorCode()); + break; } Expr *Args[] = {AllocCall.get(), ValuePrintingInfo[InterfaceKind::NewTag]}; ExprResult CXXNewCall = S.BuildCXXNew( @@ -270,8 +608,10 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { /*TypeIdParens=*/SourceRange(), TSI->getType(), TSI, std::nullopt, E->getSourceRange(), E); - assert(!CXXNewCall.isInvalid() && - "Can't create runtime placement new call!"); + if (CXXNewCall.isInvalid()) + return llvm::make_error( + "Cannot build a call to placement new", + llvm::inconvertibleErrorCode()); SetValueE = S.ActOnFinishFullExpr(CXXNewCall.get(), /*DiscardedValue=*/false); @@ -300,6 +640,7 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { using namespace clang; // Temporary rvalue struct that need special care. +extern "C" { REPL_EXTERNAL_VISIBILITY void * __clang_Interpreter_SetValueWithAlloc(void *This, void *OutVal, void *OpaqueType) { @@ -308,8 +649,9 @@ __clang_Interpreter_SetValueWithAlloc(void *This, void *OutVal, return VRef.getPtr(); } -extern "C" void REPL_EXTERNAL_VISIBILITY __clang_Interpreter_SetValueNoAlloc( - void *This, void *OutVal, void *OpaqueType, ...) { +REPL_EXTERNAL_VISIBILITY void +__clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, + ...) { Value &VRef = *(Value *)OutVal; Interpreter *I = static_cast(This); VRef = Value(I, OpaqueType); @@ -384,6 +726,7 @@ extern "C" void REPL_EXTERNAL_VISIBILITY __clang_Interpreter_SetValueNoAlloc( } va_end(args); } +} // A trampoline to work around the fact that operator placement new cannot // really be forward declared due to libc++ and libstdc++ declaration mismatch. diff --git a/clang/lib/Interpreter/Value.cpp b/clang/lib/Interpreter/Value.cpp index afdf406b37253..be2ab5587a980 100644 --- a/clang/lib/Interpreter/Value.cpp +++ b/clang/lib/Interpreter/Value.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "clang/Interpreter/Value.h" +#include "InterpreterUtils.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Type.h" #include "clang/Interpreter/Interpreter.h" @@ -19,6 +20,8 @@ #include #include +using namespace clang; + namespace { // This is internal buffer maintained by Value, used to hold temporaries. 
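For illustration only (not part of the patch), the ValueDataToString/EnumToString/CharPtrToString helpers above format REPL results roughly as sketched below; exact type spellings depend on the printing policy and are assumptions here:

// int  x = 42;                     ->  data prints as "42"
// bool b = true;                   ->  "true"
// int  a[3] = {1, 2, 3};           ->  "{ 1, 2, 3 }"
// char s[6] = "hello";             ->  "\"hello\""   (null-terminated char arrays print as strings)
// decltype(nullptr) np = nullptr;  ->  "nullptr"
// struct S {} obj;                 ->  "@0x..."      (fallback: address of an unknown object)
// With the Value::print change later in Value.cpp, the final output combines both parts,
// e.g. "(int) 42".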
@@ -117,8 +120,9 @@ static Value::Kind ConvertQualTypeToKind(const ASTContext &Ctx, QualType QT) { } } -Value::Value(Interpreter *In, void *Ty) : Interp(In), OpaqueType(Ty) { - setKind(ConvertQualTypeToKind(getASTContext(), getType())); +Value::Value(const Interpreter *In, void *Ty) : Interp(In), OpaqueType(Ty) { + const ASTContext &C = getASTContext(); + setKind(ConvertQualTypeToKind(C, getType())); if (ValueKind == K_PtrOrObj) { QualType Canon = getType().getCanonicalType(); if ((Canon->isPointerType() || Canon->isObjectType() || @@ -127,7 +131,7 @@ Value::Value(Interpreter *In, void *Ty) : Interp(In), OpaqueType(Ty) { Canon->isMemberPointerType())) { IsManuallyAlloc = true; // Compile dtor function. - Interpreter &Interp = getInterpreter(); + const Interpreter &Interp = getInterpreter(); void *DtorF = nullptr; size_t ElementsSize = 1; QualType DtorTy = getType(); @@ -228,14 +232,13 @@ void *Value::getPtr() const { return Data.m_Ptr; } -QualType Value::getType() const { - return QualType::getFromOpaquePtr(OpaqueType); +void Value::setRawBits(void *Ptr, unsigned NBits /*= sizeof(Storage)*/) { + assert(NBits <= sizeof(Storage) && "Greater than the total size"); + memcpy(/*dest=*/Data.m_RawBits, /*src=*/Ptr, /*nbytes=*/NBits / 8); } -Interpreter &Value::getInterpreter() { - assert(Interp != nullptr && - "Can't get interpreter from a default constructed value"); - return *Interp; +QualType Value::getType() const { + return QualType::getFromOpaquePtr(OpaqueType); } const Interpreter &Value::getInterpreter() const { @@ -244,8 +247,6 @@ const Interpreter &Value::getInterpreter() const { return *Interp; } -ASTContext &Value::getASTContext() { return getInterpreter().getASTContext(); } - const ASTContext &Value::getASTContext() const { return getInterpreter().getASTContext(); } @@ -253,14 +254,32 @@ const ASTContext &Value::getASTContext() const { void Value::dump() const { print(llvm::outs()); } void Value::printType(llvm::raw_ostream &Out) const { - Out << "Not implement yet.\n"; + Out << Interp->ValueTypeToString(*this); } + void Value::printData(llvm::raw_ostream &Out) const { - Out << "Not implement yet.\n"; + Out << Interp->ValueDataToString(*this); } +// FIXME: We do not support the multiple inheritance case where one of the base +// classes has a pretty-printer and the other does not. void Value::print(llvm::raw_ostream &Out) const { assert(OpaqueType != nullptr && "Can't print default Value"); - Out << "Not implement yet.\n"; + + // Don't even try to print a void or an invalid type, it doesn't make sense. + if (getType()->isVoidType() || !isValid()) + return; + + // We need to get all the results together then print it, since `printType` is + // much faster than `printData`. 
+ std::string Str; + llvm::raw_string_ostream SS(Str); + + SS << "("; + printType(SS); + SS << ") "; + printData(SS); + SS << "\n"; + Out << Str; } } // namespace clang diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp index 869c9cea566b6..9ccff5e3342d5 100644 --- a/clang/lib/Lex/DependencyDirectivesScanner.cpp +++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -560,15 +560,13 @@ bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First, if (Tok.is(tok::semi)) break; } + + const auto &Tok = lexToken(First, End); pushDirective(Kind); - skipWhitespace(First, End); - if (First == End) + if (Tok.is(tok::eof) || Tok.is(tok::eod)) return false; - if (!isVerticalWhitespace(*First)) - return reportError( - DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import); - skipNewline(First, End); - return false; + return reportError(DirectiveLoc, + diag::err_dep_source_scanner_unexpected_tokens_at_import); } dependency_directives_scan::Token &Scanner::lexToken(const char *&First, @@ -735,6 +733,13 @@ bool Scanner::lexModule(const char *&First, const char *const End) { return false; break; } + case ';': { + // Handle the global module fragment `module;`. + if (Id == "module" && !Export) + break; + skipLine(First, End); + return false; + } case '<': case '"': break; @@ -905,14 +910,6 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) { CurDirToks.clear(); }); - // Handle "@import". - if (*First == '@') - return lexAt(First, End); - - // Handle module directives for C++20 modules. - if (*First == 'i' || *First == 'e' || *First == 'm') - return lexModule(First, End); - if (*First == '_') { if (isNextIdentifierOrSkipLine("_Pragma", First, End)) return lex_Pragma(First, End); @@ -925,6 +922,14 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) { auto ScEx2 = make_scope_exit( [&]() { TheLexer.setParsingPreprocessorDirective(false); }); + // Handle "@import". + if (*First == '@') + return lexAt(First, End); + + // Handle module directives for C++20 modules. + if (*First == 'i' || *First == 'e' || *First == 'm') + return lexModule(First, End); + // Lex '#'. const dependency_directives_scan::Token &HashTok = lexToken(First, End); if (HashTok.is(tok::hashhash)) { diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index a62508e3e27bf..5b08d7f0efe5a 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -1467,7 +1467,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { if (s != PossibleNewDigitStart) DigitsBegin = PossibleNewDigitStart; else - IsSingleZero = (s == ThisTokEnd); // Is the only thing we've seen a 0? + IsSingleZero = (s == ThisTokBegin + 1); if (s == ThisTokEnd) return; // Done, simple octal number like 01234 diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp index 01c85e6ad95d5..bba3c89bed38f 100644 --- a/clang/lib/Lex/Pragma.cpp +++ b/clang/lib/Lex/Pragma.cpp @@ -591,7 +591,8 @@ IdentifierInfo *Preprocessor::ParsePragmaPushOrPopMacro(Token &Tok) { } // Remember the macro string. - std::string StrVal = getSpelling(Tok); + Token StrTok = Tok; + std::string StrVal = getSpelling(StrTok); // Read the ')'. 
Lex(Tok); @@ -604,6 +605,15 @@ IdentifierInfo *Preprocessor::ParsePragmaPushOrPopMacro(Token &Tok) { assert(StrVal[0] == '"' && StrVal[StrVal.size()-1] == '"' && "Invalid string token!"); + if (StrVal.size() <= 2) { + Diag(StrTok.getLocation(), diag::warn_pargma_push_pop_macro_empty_string) + << SourceRange( + StrTok.getLocation(), + StrTok.getLocation().getLocWithOffset(StrTok.getLength())) + << PragmaTok.getIdentifierInfo()->isStr("pop_macro"); + return nullptr; + } + // Create a Token from the string. Token MacroTok; MacroTok.startToken(); diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index bcd3ea60ce3da..e278846f6f36d 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -950,6 +950,8 @@ void Preprocessor::Lex(Token &Result) { case tok::period: ModuleDeclState.handlePeriod(); break; + case tok::eod: + break; case tok::identifier: // Check "import" and "module" when there is no open bracket. The two // identifiers are not meaningful with open brackets. diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 9cae4f9a23ef0..31392d1dd8d4b 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -591,8 +591,7 @@ bool Parser::ParseUsingDeclarator(DeclaratorContext Context, NextToken().isRegularKeywordAttribute() || NextToken().is(tok::kw___attribute)) && D.SS.isNotEmpty() && LastII == Tok.getIdentifierInfo() && - !D.SS.getScopeRep()->getAsNamespace() && - !D.SS.getScopeRep()->getAsNamespaceAlias()) { + D.SS.getScopeRep()->getKind() != NestedNameSpecifier::Namespace) { SourceLocation IdLoc = ConsumeToken(); ParsedType Type = Actions.getInheritingConstructorName(D.SS, IdLoc, *LastII); diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 31b84b6f2ede0..bf1978c22ee9f 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -541,7 +541,8 @@ StmtResult Parser::ParseExprStatement(ParsedStmtContext StmtCtx) { } Token *CurTok = nullptr; - // Note we shouldn't eat the token since the callback needs it. + // If the semicolon is missing at the end of REPL input, we want to print + // the result. Note we shouldn't eat the token since the callback needs it. if (Tok.is(tok::annot_repl_input_end)) CurTok = &Tok; else diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 8834bf80c4016..ff50b3f83908c 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -2519,6 +2519,7 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, break; } ExpectAndConsumeSemi(diag::err_module_expected_semi); + TryConsumeToken(tok::eod); if (SeenError) return nullptr; diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 5eba024e83634..829c81bab16f5 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -522,8 +522,7 @@ static bool areAllValuesNoReturn(const VarDecl *VD, const CFGBlock &VarBlk, } // If all checked blocks satisfy the condition, the check is finished. 
- if (std::all_of(BlocksToCheck.begin(), BlocksToCheck.end(), - BlockSatisfiesCondition)) + if (llvm::all_of(BlocksToCheck, BlockSatisfiesCondition)) return true; // If this block does not contain the variable definition, check @@ -2902,8 +2901,7 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( .setAlwaysAdd(Stmt::UnaryOperatorClass); } - bool EnableLifetimeSafetyAnalysis = !Diags.isIgnored( - diag::warn_experimental_lifetime_safety_dummy_warning, D->getBeginLoc()); + bool EnableLifetimeSafetyAnalysis = S.getLangOpts().EnableLifetimeSafety; // Install the logical handler. std::optional LEH; if (LogicalErrorHandler::hasActiveDiagnostics(Diags, D->getBeginLoc())) { @@ -3030,8 +3028,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( // TODO: Enable lifetime safety analysis for other languages once it is // stable. if (EnableLifetimeSafetyAnalysis && S.getLangOpts().CPlusPlus) { - if (CFG *cfg = AC.getCFG()) - runLifetimeSafetyAnalysis(*cast(D), *cfg, AC); + if (AC.getCFG()) + lifetimes::runLifetimeSafetyAnalysis(AC); } // Check for violations of "called once" parameter properties. if (S.getLangOpts().ObjC && !S.getLangOpts().CPlusPlus && diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp index ee5a862c32509..f0f1d66f66e93 100644 --- a/clang/lib/Sema/DeclSpec.cpp +++ b/clang/lib/Sema/DeclSpec.cpp @@ -72,7 +72,7 @@ void CXXScopeSpec::Extend(ASTContext &Context, IdentifierInfo *Identifier, "NestedNameSpecifierLoc range computation incorrect"); } -void CXXScopeSpec::Extend(ASTContext &Context, NamespaceDecl *Namespace, +void CXXScopeSpec::Extend(ASTContext &Context, NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc) { Builder.Extend(Context, Namespace, NamespaceLoc, ColonColonLoc); @@ -85,19 +85,6 @@ void CXXScopeSpec::Extend(ASTContext &Context, NamespaceDecl *Namespace, "NestedNameSpecifierLoc range computation incorrect"); } -void CXXScopeSpec::Extend(ASTContext &Context, NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, - SourceLocation ColonColonLoc) { - Builder.Extend(Context, Alias, AliasLoc, ColonColonLoc); - - if (Range.getBegin().isInvalid()) - Range.setBegin(AliasLoc); - Range.setEnd(ColonColonLoc); - - assert(Range == Builder.getSourceRange() && - "NestedNameSpecifierLoc range computation incorrect"); -} - void CXXScopeSpec::MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc) { Builder.MakeGlobal(Context, ColonColonLoc); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index e6414a623b929..c23c98aa3aaeb 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds: + case AMDGPU::BI__builtin_amdgcn_struct_ptr_buffer_load_lds: case AMDGPU::BI__builtin_amdgcn_load_to_lds: case AMDGPU::BI__builtin_amdgcn_global_load_lds: { constexpr const int SizeIdx = 2; diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp index bd603a925d15e..8e27fabccd583 100644 --- a/clang/lib/Sema/SemaARM.cpp +++ b/clang/lib/Sema/SemaARM.cpp @@ -1535,4 +1535,95 @@ bool SemaARM::areLaxCompatibleSveTypes(QualType FirstType, IsLaxCompatible(SecondType, FirstType); } +bool SemaARM::checkTargetVersionAttr(const StringRef Param, + const SourceLocation Loc) { + using namespace DiagAttrParams; + + llvm::SmallVector Features; + Param.split(Features, '+'); + for (StringRef Feat : Features) { + Feat = Feat.trim(); + if 
(Feat == "default") + continue; + if (!getASTContext().getTargetInfo().validateCpuSupports(Feat)) + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << Feat << TargetVersion; + } + return false; +} + +bool SemaARM::checkTargetClonesAttr( + SmallVectorImpl &Params, SmallVectorImpl &Locs, + SmallVectorImpl> &NewParams) { + using namespace DiagAttrParams; + + if (!getASTContext().getTargetInfo().hasFeature("fmv")) + return true; + + assert(Params.size() == Locs.size() && + "Mismatch between number of string parameters and locations"); + + bool HasDefault = false; + bool HasNonDefault = false; + for (unsigned I = 0, E = Params.size(); I < E; ++I) { + const StringRef Param = Params[I].trim(); + const SourceLocation &Loc = Locs[I]; + + if (Param.empty()) + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << "" << TargetClones; + + if (Param == "default") { + if (HasDefault) + Diag(Loc, diag::warn_target_clone_duplicate_options); + else { + NewParams.push_back(Param); + HasDefault = true; + } + continue; + } + + bool HasCodeGenImpact = false; + llvm::SmallVector Features; + llvm::SmallVector ValidFeatures; + Param.split(Features, '+'); + for (StringRef Feat : Features) { + Feat = Feat.trim(); + if (!getASTContext().getTargetInfo().validateCpuSupports(Feat)) { + Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << Feat << TargetClones; + continue; + } + if (getASTContext().getTargetInfo().doesFeatureAffectCodeGen(Feat)) + HasCodeGenImpact = true; + ValidFeatures.push_back(Feat); + } + + // Ignore features that don't impact code generation. + if (!HasCodeGenImpact) { + Diag(Loc, diag::warn_target_clone_no_impact_options); + continue; + } + + if (ValidFeatures.empty()) + continue; + + // Canonicalize attribute parameter. + llvm::sort(ValidFeatures); + SmallString<64> NewParam(llvm::join(ValidFeatures, "+")); + if (llvm::is_contained(NewParams, NewParam)) { + Diag(Loc, diag::warn_target_clone_duplicate_options); + continue; + } + + // Valid non-default argument. + NewParams.push_back(NewParam); + HasNonDefault = true; + } + if (!HasNonDefault) + return true; + + return false; +} + } // namespace clang diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp index ab83f625d2849..6ac04837708f6 100644 --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp @@ -137,10 +137,7 @@ DeclContext *Sema::computeDeclContext(const CXXScopeSpec &SS, llvm_unreachable("Dependent nested-name-specifier has no DeclContext"); case NestedNameSpecifier::Namespace: - return NNS->getAsNamespace(); - - case NestedNameSpecifier::NamespaceAlias: - return NNS->getAsNamespaceAlias()->getNamespace(); + return NNS->getAsNamespace()->getNamespace(); case NestedNameSpecifier::TypeSpec: { const TagType *Tag = NNS->getAsType()->getAs(); @@ -992,7 +989,6 @@ bool Sema::ShouldEnterDeclaratorScope(Scope *S, const CXXScopeSpec &SS) { switch (Qualifier->getKind()) { case NestedNameSpecifier::Global: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: // These are always namespace scopes. We never want to enter a // namespace scope from anything but a file context. 
return CurContext->getRedeclContext()->isFileContext(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index dd5b710d7e1d4..c74b67106ad74 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3013,6 +3013,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, case Builtin::BI__builtin_elementwise_maxnum: case Builtin::BI__builtin_elementwise_minimum: case Builtin::BI__builtin_elementwise_maximum: + case Builtin::BI__builtin_elementwise_minimumnum: + case Builtin::BI__builtin_elementwise_maximumnum: case Builtin::BI__builtin_elementwise_atan2: case Builtin::BI__builtin_elementwise_fmod: case Builtin::BI__builtin_elementwise_pow: @@ -5239,7 +5241,9 @@ bool Sema::BuiltinVAStartARMMicrosoft(CallExpr *Call) { << 2 << Arg1->getType() << ConstCharPtrTy; const QualType SizeTy = Context.getSizeType(); - if (Arg2Ty->getCanonicalTypeInternal().withoutLocalFastQualifiers() != SizeTy) + if (!Context.hasSameType( + Arg2Ty->getCanonicalTypeInternal().withoutLocalFastQualifiers(), + SizeTy)) Diag(Arg2->getBeginLoc(), diag::err_typecheck_convert_incompatible) << Arg2->getType() << SizeTy << 1 /* different class */ << 0 /* qualifier difference */ diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 834417f8e15ac..5205ca0bca6fa 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -925,7 +925,12 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction( ND && ND->isFunctionOrFunctionTemplate()) { ScopeForParameters.emplace(S, /*CombineWithOuterScope=*/true); const FunctionDecl *FD = ND->getAsFunction(); + if (FunctionTemplateDecl *Template = FD->getDescribedFunctionTemplate(); + Template && Template->getInstantiatedFromMemberTemplate()) + FD = Template->getInstantiatedFromMemberTemplate()->getTemplatedDecl(); for (auto *PVD : FD->parameters()) { + if (ScopeForParameters->getInstantiationOfIfExists(PVD)) + continue; if (!PVD->isParameterPack()) { ScopeForParameters->InstantiatedLocal(PVD, PVD); continue; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 78f4804202ddc..9a2950cf1648e 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3254,9 +3254,8 @@ static void handleCodeSegAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) { - enum FirstParam { Unsupported, Duplicate, Unknown }; - enum SecondParam { None, CPU, Tune }; - enum ThirdParam { Target, TargetClones }; + using namespace DiagAttrParams; + if (AttrStr.contains("fpmath=")) return Diag(LiteralLoc, diag::warn_unsupported_target_attribute) << Unsupported << None << "fpmath=" << Target; @@ -3331,80 +3330,22 @@ bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) { return false; } -bool Sema::checkTargetVersionAttr(SourceLocation LiteralLoc, Decl *D, - StringRef AttrStr) { - enum FirstParam { Unsupported }; - enum SecondParam { None }; - enum ThirdParam { Target, TargetClones, TargetVersion }; - llvm::SmallVector Features; - if (Context.getTargetInfo().getTriple().isRISCV()) { - llvm::SmallVector AttrStrs; - AttrStr.split(AttrStrs, ';'); - - bool HasArch = false; - bool HasPriority = false; - bool HasDefault = false; - bool DuplicateAttr = false; - for (auto &AttrStr : AttrStrs) { - // Only support arch=+ext,... syntax. 
- if (AttrStr.starts_with("arch=+")) { - if (HasArch) - DuplicateAttr = true; - HasArch = true; - ParsedTargetAttr TargetAttr = - Context.getTargetInfo().parseTargetAttr(AttrStr); - - if (TargetAttr.Features.empty() || - llvm::any_of(TargetAttr.Features, [&](const StringRef Ext) { - return !RISCV().isValidFMVExtension(Ext); - })) - return Diag(LiteralLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << AttrStr << TargetVersion; - } else if (AttrStr.starts_with("default")) { - if (HasDefault) - DuplicateAttr = true; - HasDefault = true; - } else if (AttrStr.consume_front("priority=")) { - if (HasPriority) - DuplicateAttr = true; - HasPriority = true; - unsigned Digit; - if (AttrStr.getAsInteger(0, Digit)) - return Diag(LiteralLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << AttrStr << TargetVersion; - } else { - return Diag(LiteralLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << AttrStr << TargetVersion; - } - } - - if (((HasPriority || HasArch) && HasDefault) || DuplicateAttr || - (HasPriority && !HasArch)) - return Diag(LiteralLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << AttrStr << TargetVersion; +static void handleTargetVersionAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + StringRef Param; + SourceLocation Loc; + if (!S.checkStringLiteralArgumentAttr(AL, 0, Param, &Loc)) + return; - return false; - } - AttrStr.split(Features, "+"); - for (auto &CurFeature : Features) { - CurFeature = CurFeature.trim(); - if (CurFeature == "default") - continue; - if (!Context.getTargetInfo().validateCpuSupports(CurFeature)) - return Diag(LiteralLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << CurFeature << TargetVersion; + if (S.Context.getTargetInfo().getTriple().isAArch64()) { + if (S.ARM().checkTargetVersionAttr(Param, Loc)) + return; + } else if (S.Context.getTargetInfo().getTriple().isRISCV()) { + if (S.RISCV().checkTargetVersionAttr(Param, Loc)) + return; } - return false; -} -static void handleTargetVersionAttr(Sema &S, Decl *D, const ParsedAttr &AL) { - StringRef Str; - SourceLocation LiteralLoc; - if (!S.checkStringLiteralArgumentAttr(AL, 0, Str, &LiteralLoc) || - S.checkTargetVersionAttr(LiteralLoc, D, Str)) - return; TargetVersionAttr *NewAttr = - ::new (S.Context) TargetVersionAttr(S.Context, AL, Str); + ::new (S.Context) TargetVersionAttr(S.Context, AL, Param); D->addAttr(NewAttr); } @@ -3419,158 +3360,7 @@ static void handleTargetAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(NewAttr); } -bool Sema::checkTargetClonesAttrString( - SourceLocation LiteralLoc, StringRef Str, const StringLiteral *Literal, - Decl *D, bool &HasDefault, bool &HasCommas, bool &HasNotDefault, - SmallVectorImpl> &StringsBuffer) { - enum FirstParam { Unsupported, Duplicate, Unknown }; - enum SecondParam { None, CPU, Tune }; - enum ThirdParam { Target, TargetClones }; - HasCommas = HasCommas || Str.contains(','); - const TargetInfo &TInfo = Context.getTargetInfo(); - // Warn on empty at the beginning of a string. 
- if (Str.size() == 0) - return Diag(LiteralLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << "" << TargetClones; - - std::pair Parts = {{}, Str}; - while (!Parts.second.empty()) { - Parts = Parts.second.split(','); - StringRef Cur = Parts.first.trim(); - SourceLocation CurLoc = - Literal->getLocationOfByte(Cur.data() - Literal->getString().data(), - getSourceManager(), getLangOpts(), TInfo); - - bool DefaultIsDupe = false; - bool HasCodeGenImpact = false; - if (Cur.empty()) - return Diag(CurLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << "" << TargetClones; - - if (TInfo.getTriple().isAArch64()) { - // AArch64 target clones specific - if (Cur == "default") { - DefaultIsDupe = HasDefault; - HasDefault = true; - if (llvm::is_contained(StringsBuffer, Cur) || DefaultIsDupe) - Diag(CurLoc, diag::warn_target_clone_duplicate_options); - else - StringsBuffer.push_back(Cur); - } else { - std::pair CurParts = {{}, Cur}; - llvm::SmallVector CurFeatures; - while (!CurParts.second.empty()) { - CurParts = CurParts.second.split('+'); - StringRef CurFeature = CurParts.first.trim(); - if (!TInfo.validateCpuSupports(CurFeature)) { - Diag(CurLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << CurFeature << TargetClones; - continue; - } - if (TInfo.doesFeatureAffectCodeGen(CurFeature)) - HasCodeGenImpact = true; - CurFeatures.push_back(CurFeature); - } - // Canonize TargetClones Attributes - llvm::sort(CurFeatures); - SmallString<64> Res; - for (auto &CurFeat : CurFeatures) { - if (!Res.empty()) - Res.append("+"); - Res.append(CurFeat); - } - if (llvm::is_contained(StringsBuffer, Res) || DefaultIsDupe) - Diag(CurLoc, diag::warn_target_clone_duplicate_options); - else if (!HasCodeGenImpact) - // Ignore features in target_clone attribute that don't impact - // code generation - Diag(CurLoc, diag::warn_target_clone_no_impact_options); - else if (!Res.empty()) { - StringsBuffer.push_back(Res); - HasNotDefault = true; - } - } - } else if (TInfo.getTriple().isRISCV()) { - // Suppress warn_target_clone_mixed_values - HasCommas = false; - - // Cur is split's parts of Str. RISC-V uses Str directly, - // so skip when encountered more than once. - if (!Str.starts_with(Cur)) - continue; - - llvm::SmallVector AttrStrs; - Str.split(AttrStrs, ";"); - - bool IsPriority = false; - bool IsDefault = false; - for (auto &AttrStr : AttrStrs) { - // Only support arch=+ext,... syntax. 
- if (AttrStr.starts_with("arch=+")) { - ParsedTargetAttr TargetAttr = - Context.getTargetInfo().parseTargetAttr(AttrStr); - - if (TargetAttr.Features.empty() || - llvm::any_of(TargetAttr.Features, [&](const StringRef Ext) { - return !RISCV().isValidFMVExtension(Ext); - })) - return Diag(CurLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << Str << TargetClones; - } else if (AttrStr.starts_with("default")) { - IsDefault = true; - DefaultIsDupe = HasDefault; - HasDefault = true; - } else if (AttrStr.consume_front("priority=")) { - IsPriority = true; - unsigned Digit; - if (AttrStr.getAsInteger(0, Digit)) - return Diag(CurLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << Str << TargetClones; - } else { - return Diag(CurLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << Str << TargetClones; - } - } - - if (IsPriority && IsDefault) - return Diag(CurLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << Str << TargetClones; - - if (llvm::is_contained(StringsBuffer, Str) || DefaultIsDupe) - Diag(CurLoc, diag::warn_target_clone_duplicate_options); - StringsBuffer.push_back(Str); - } else { - // Other targets ( currently X86 ) - if (Cur.starts_with("arch=")) { - if (!Context.getTargetInfo().isValidCPUName( - Cur.drop_front(sizeof("arch=") - 1))) - return Diag(CurLoc, diag::warn_unsupported_target_attribute) - << Unsupported << CPU << Cur.drop_front(sizeof("arch=") - 1) - << TargetClones; - } else if (Cur == "default") { - DefaultIsDupe = HasDefault; - HasDefault = true; - } else if (!Context.getTargetInfo().isValidFeatureName(Cur) || - Context.getTargetInfo().getFMVPriority(Cur) == 0) - return Diag(CurLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << Cur << TargetClones; - if (llvm::is_contained(StringsBuffer, Cur) || DefaultIsDupe) - Diag(CurLoc, diag::warn_target_clone_duplicate_options); - // Note: Add even if there are duplicates, since it changes name mangling. - StringsBuffer.push_back(Cur); - } - } - if (Str.rtrim().ends_with(",")) - return Diag(LiteralLoc, diag::warn_unsupported_target_attribute) - << Unsupported << None << "" << TargetClones; - return false; -} - static void handleTargetClonesAttr(Sema &S, Decl *D, const ParsedAttr &AL) { - if (S.Context.getTargetInfo().getTriple().isAArch64() && - !S.Context.getTargetInfo().hasFeature("fmv")) - return; - // Ensure we don't combine these with themselves, since that causes some // confusing behavior. 
if (const auto *Other = D->getAttr()) { @@ -3581,31 +3371,6 @@ static void handleTargetClonesAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (checkAttrMutualExclusion(S, D, AL)) return; - SmallVector Strings; - SmallVector, 2> StringsBuffer; - bool HasCommas = false, HasDefault = false, HasNotDefault = false; - - for (unsigned I = 0, E = AL.getNumArgs(); I != E; ++I) { - StringRef CurStr; - SourceLocation LiteralLoc; - if (!S.checkStringLiteralArgumentAttr(AL, I, CurStr, &LiteralLoc) || - S.checkTargetClonesAttrString( - LiteralLoc, CurStr, - cast(AL.getArgAsExpr(I)->IgnoreParenCasts()), D, - HasDefault, HasCommas, HasNotDefault, StringsBuffer)) - return; - } - for (auto &SmallStr : StringsBuffer) - Strings.push_back(SmallStr.str()); - - if (HasCommas && AL.getNumArgs() > 1) - S.Diag(AL.getLoc(), diag::warn_target_clone_mixed_values); - - if (!HasDefault && !S.Context.getTargetInfo().getTriple().isAArch64()) { - S.Diag(AL.getLoc(), diag::err_target_clone_must_have_default); - return; - } - // FIXME: We could probably figure out how to get this to work for lambdas // someday. if (const auto *MD = dyn_cast(D)) { @@ -3617,13 +3382,34 @@ static void handleTargetClonesAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } } - // No multiversion if we have default version only. - if (S.Context.getTargetInfo().getTriple().isAArch64() && !HasNotDefault) - return; + SmallVector Params; + SmallVector Locations; + for (unsigned I = 0, E = AL.getNumArgs(); I != E; ++I) { + StringRef Param; + SourceLocation Loc; + if (!S.checkStringLiteralArgumentAttr(AL, I, Param, &Loc)) + return; + Params.push_back(Param); + Locations.push_back(Loc); + } + + SmallVector, 2> NewParams; + if (S.Context.getTargetInfo().getTriple().isAArch64()) { + if (S.ARM().checkTargetClonesAttr(Params, Locations, NewParams)) + return; + } else if (S.Context.getTargetInfo().getTriple().isRISCV()) { + if (S.RISCV().checkTargetClonesAttr(Params, Locations, NewParams)) + return; + } else if (S.Context.getTargetInfo().getTriple().isX86()) { + if (S.X86().checkTargetClonesAttr(Params, Locations, NewParams)) + return; + } + Params.clear(); + for (auto &SmallStr : NewParams) + Params.push_back(SmallStr.str()); - cast(D)->setIsMultiVersion(); TargetClonesAttr *NewAttr = ::new (S.Context) - TargetClonesAttr(S.Context, AL, Strings.data(), Strings.size()); + TargetClonesAttr(S.Context, AL, Params.data(), Params.size()); D->addAttr(NewAttr); } diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index f60ab4f0da7a0..f5b4614576086 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -11930,7 +11930,7 @@ Decl *Sema::ActOnStartNamespaceDef(Scope *NamespcScope, /// getNamespaceDecl - Returns the namespace a decl represents. If the decl /// is a namespace alias, returns the namespace it points to. -static inline NamespaceDecl *getNamespaceDecl(NamedDecl *D) { +static inline NamespaceDecl *getNamespaceDecl(NamespaceBaseDecl *D) { if (NamespaceAliasDecl *AD = dyn_cast_or_null(D)) return AD->getNamespace(); return dyn_cast_or_null(D); @@ -13829,7 +13829,7 @@ Decl *Sema::ActOnNamespaceAliasDef(Scope *S, SourceLocation NamespaceLoc, } } assert(!R.isAmbiguous() && !R.empty()); - NamedDecl *ND = R.getRepresentativeDecl(); + auto *ND = cast(R.getRepresentativeDecl()); // Check if we have a previous declaration with the same name. 
LookupResult PrevR(*this, Alias, AliasLoc, LookupOrdinaryName, @@ -18682,7 +18682,7 @@ bool Sema::CheckOverridingFunctionAttributes(CXXMethodDecl *New, case FunctionEffectDiff::OverrideResult::NoAction: break; case FunctionEffectDiff::OverrideResult::Warn: - Diag(New->getLocation(), diag::warn_mismatched_func_effect_override) + Diag(New->getLocation(), diag::warn_conflicting_func_effect_override) << Diff.effectName(); Diag(Old->getLocation(), diag::note_overridden_virtual_function) << Old->getReturnTypeSourceRange(); @@ -18695,6 +18695,14 @@ bool Sema::CheckOverridingFunctionAttributes(CXXMethodDecl *New, QualType ModQT = Context.getFunctionType(NewFT->getReturnType(), NewFT->getParamTypes(), EPI); New->setType(ModQT); + if (Errs.empty()) { + // A warning here is somewhat pedantic. Skip this if there was + // already a merge conflict, which is more serious. + Diag(New->getLocation(), diag::warn_mismatched_func_effect_override) + << Diff.effectName(); + Diag(Old->getLocation(), diag::note_overridden_virtual_function) + << Old->getReturnTypeSourceRange(); + } break; } } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 728ada33e2e63..45c7178c6965d 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -4564,6 +4564,9 @@ static void captureVariablyModifiedType(ASTContext &Context, QualType T, case Type::Atomic: T = cast(Ty)->getValueType(); break; + case Type::PredefinedSugar: + T = cast(Ty)->desugar(); + break; } } while (!T.isNull() && T->isVariablyModifiedType()); } diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index f851c9e1d5015..0edfd6015cbd9 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -518,7 +518,6 @@ bool Sema::checkLiteralOperatorId(const CXXScopeSpec &SS, case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: return false; } @@ -3462,11 +3461,11 @@ void Sema::DeclareGlobalAllocationFunction(DeclarationName Name, // non-templated allocation function we are trying to declare here. if (FunctionDecl *Func = dyn_cast(*Alloc)) { if (Func->getNumParams() == Params.size()) { - llvm::SmallVector FuncParams; - for (auto *P : Func->parameters()) - FuncParams.push_back( - Context.getCanonicalType(P->getType().getUnqualifiedType())); - if (llvm::ArrayRef(FuncParams) == Params) { + if (std::equal(Func->param_begin(), Func->param_end(), Params.begin(), + Params.end(), [&](ParmVarDecl *D, QualType RT) { + return Context.hasSameUnqualifiedType(D->getType(), + RT); + })) { // Make the function visible to name lookup, even if we found it in // an unimported module. It either is an implicitly-declared global // allocation function, or is suppressing that function. 
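The DeclareGlobalAllocationFunction hunk above replaces a temporary vector of canonicalized parameter types with a single std::equal call that compares the candidate's parameters against the expected types through ASTContext::hasSameUnqualifiedType. A minimal standalone sketch of that std::equal-with-predicate idiom follows; the Param struct, typeId field, and sameSignature function are hypothetical illustration names, not code from this patch.

#include <algorithm>
#include <vector>

struct Param { int typeId; }; // stand-in for a parameter declaration

// True only when both sequences have the same length and every pair of
// elements satisfies the predicate, which is the same shape of comparison
// the patch uses for parameter-type matching.
bool sameSignature(const std::vector<Param> &Lhs, const std::vector<int> &Rhs) {
  return std::equal(Lhs.begin(), Lhs.end(), Rhs.begin(), Rhs.end(),
                    [](const Param &P, int T) { return P.typeId == T; });
}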
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 95746b35f71ef..1c6f292454ed6 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -3572,7 +3572,7 @@ ExprResult Sema::ActOnDesignatedInitializer(Designation &Desig, Designators.push_back(ASTDesignator::CreateFieldDesignator( D.getFieldDecl(), D.getDotLoc(), D.getFieldLoc())); } else if (D.isArrayDesignator()) { - Expr *Index = static_cast(D.getArrayIndex()); + Expr *Index = D.getArrayIndex(); llvm::APSInt IndexValue; if (!Index->isTypeDependent() && !Index->isValueDependent()) Index = CheckArrayDesignatorExpr(*this, Index, IndexValue).get(); @@ -3584,8 +3584,8 @@ ExprResult Sema::ActOnDesignatedInitializer(Designation &Desig, InitExpressions.push_back(Index); } } else if (D.isArrayRangeDesignator()) { - Expr *StartIndex = static_cast(D.getArrayRangeStart()); - Expr *EndIndex = static_cast(D.getArrayRangeEnd()); + Expr *StartIndex = D.getArrayRangeStart(); + Expr *EndIndex = D.getArrayRangeEnd(); llvm::APSInt StartValue; llvm::APSInt EndValue; bool StartDependent = StartIndex->isTypeDependent() || diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 6d6e07a2c03c7..8bde18f64f80b 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -4560,15 +4560,14 @@ static void getNestedNameSpecifierIdentifiers( II = NNS->getAsIdentifier(); break; - case NestedNameSpecifier::Namespace: - if (NNS->getAsNamespace()->isAnonymousNamespace()) + case NestedNameSpecifier::Namespace: { + const NamespaceBaseDecl *Namespace = NNS->getAsNamespace(); + if (const auto *NS = dyn_cast(Namespace); + NS && NS->isAnonymousNamespace()) return; - II = NNS->getAsNamespace()->getIdentifier(); - break; - - case NestedNameSpecifier::NamespaceAlias: - II = NNS->getAsNamespaceAlias()->getIdentifier(); + II = Namespace->getIdentifier(); break; + } case NestedNameSpecifier::TypeSpec: II = QualType(NNS->getAsType(), 0).getBaseTypeIdentifier(); diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 46aa7dd0dcc21..128a5db57bf73 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -624,6 +624,66 @@ void SemaOpenACC::CheckDeclReference(SourceLocation Loc, Expr *E, Decl *D) { // loop (or we aren't in a loop!) so skip the diagnostic. } +namespace { +// Check whether the type of the thing we are referencing is OK for things like +// private, firstprivate, and reduction, which require certain operators to be +// available. +ExprResult CheckVarType(SemaOpenACC &S, OpenACCClauseKind CK, Expr *VarExpr, + Expr *InnerExpr) { + // There is nothing to do here, only these three have these sorts of + // restrictions. + if (CK != OpenACCClauseKind::Private && + CK != OpenACCClauseKind::FirstPrivate && + CK != OpenACCClauseKind::Reduction) + return VarExpr; + + // We can't test this if it isn't here, or if the type isn't clear yet. + if (!InnerExpr || InnerExpr->isTypeDependent()) + return VarExpr; + + const auto *RD = InnerExpr->getType()->getAsCXXRecordDecl(); + + // if this isn't a C++ record decl, we can create/copy/destroy this thing at + // will without problem, so this is a success. 
+ if (!RD) + return VarExpr; + + // TODO: OpenACC: + // Private must have default ctor + dtor in InnerExpr + // FirstPrivate must have copyctor + dtor in InnerExpr + // Reduction must have copyctor + dtor + operation in InnerExpr + + // TODO OpenACC: It isn't clear what the requirements are for default + // constructor/copy constructor are for First private and reduction, but + // private requires a default constructor. + if (CK == OpenACCClauseKind::Private) { + bool HasNonDeletedDefaultCtor = + llvm::find_if(RD->ctors(), [](const CXXConstructorDecl *CD) { + return CD->isDefaultConstructor() && !CD->isDeleted(); + }) != RD->ctors().end(); + if (!HasNonDeletedDefaultCtor && !RD->needsImplicitDefaultConstructor()) { + S.Diag(InnerExpr->getBeginLoc(), + clang::diag::warn_acc_var_referenced_lacks_op) + << InnerExpr->getType() << CK + << clang::diag::AccVarReferencedReason::DefCtor; + return ExprError(); + } + } + + // All 3 things need to make sure they have a dtor. + bool DestructorDeleted = + RD->getDestructor() && RD->getDestructor()->isDeleted(); + if (DestructorDeleted && !RD->needsImplicitDestructor()) { + S.Diag(InnerExpr->getBeginLoc(), + clang::diag::warn_acc_var_referenced_lacks_op) + << InnerExpr->getType() << CK + << clang::diag::AccVarReferencedReason::Dtor; + return ExprError(); + } + return VarExpr; +} +} // namespace + ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, Expr *VarExpr) { // This has unique enough restrictions that we should split it to a separate @@ -660,7 +720,7 @@ ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, if (const auto *DRE = dyn_cast(CurVarExpr)) { if (isa( DRE->getFoundDecl()->getCanonicalDecl())) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); } // If CK is a Reduction, this special cases for OpenACC3.3 2.5.15: "A var in a @@ -679,9 +739,9 @@ ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, // declare, reduction, and use_device. const auto *This = dyn_cast(ME->getBase()); if (This && This->isImplicit()) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); } else { - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); } } } @@ -690,14 +750,14 @@ ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, // doesn't fall into 'variable or array name' if (CK != OpenACCClauseKind::UseDevice && DK != OpenACCDirectiveKind::Declare && isa(CurVarExpr)) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); // Nothing really we can do here, as these are dependent. So just return they // are valid. if (isa(CurVarExpr) || (CK != OpenACCClauseKind::Reduction && isa(CurVarExpr))) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); // There isn't really anything we can do in the case of a recovery expr, so // skip the diagnostic rather than produce a confusing diagnostic. 
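The CheckVarType helper added above spells out its intent in its own comments: for an OpenACC private clause the referenced type needs a usable default constructor, and private, firstprivate, and reduction all need a non-deleted destructor. A rough illustration of the kind of user code the new warn_acc_var_referenced_lacks_op diagnostic targets is sketched below; it is a hypothetical example, assuming the diagnostic fires as written in the patch, and is not part of the patch itself.

// Illustration only, not part of the patch.
struct NoDefaultCtor {
  NoDefaultCtor() = delete;   // should trip the DefCtor check for 'private'
  explicit NoDefaultCtor(int);
};

struct NoDtor {
  ~NoDtor() = delete;         // should trip the Dtor check for all three clauses
};

void use(NoDefaultCtor A, NoDtor &B) {
#pragma acc parallel private(A)      // expected: warning about the deleted default ctor
  {}
#pragma acc parallel firstprivate(B) // expected: warning about the deleted destructor
  {}
}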
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index f3baf0c3ef3bc..5dd5b495480d9 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -245,7 +245,6 @@ void StandardConversionSequence::setAsIdentityConversion() { IsLvalueReference = true; BindsToFunctionLvalue = false; BindsToRvalue = false; - IsImplicitObjectArgumentQualificationConversion = false; BindsImplicitObjectArgumentWithoutRefQualifier = false; ObjCLifetimeConversionBinding = false; FromBracedInitList = false; @@ -5318,7 +5317,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType, ICS.Standard.DirectBinding = BindsDirectly; ICS.Standard.IsLvalueReference = !isRValRef; ICS.Standard.BindsToFunctionLvalue = T2->isFunctionType(); - ICS.Standard.IsImplicitObjectArgumentQualificationConversion = false; ICS.Standard.BindsToRvalue = InitCategory.isRValue(); ICS.Standard.BindsImplicitObjectArgumentWithoutRefQualifier = false; ICS.Standard.ObjCLifetimeConversionBinding = @@ -5498,7 +5496,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType, ICS.Standard.IsLvalueReference = !isRValRef; ICS.Standard.BindsToFunctionLvalue = false; ICS.Standard.BindsToRvalue = true; - ICS.Standard.IsImplicitObjectArgumentQualificationConversion = false; ICS.Standard.BindsImplicitObjectArgumentWithoutRefQualifier = false; ICS.Standard.ObjCLifetimeConversionBinding = false; } else if (ICS.isUserDefined()) { @@ -5521,8 +5518,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType, ICS.UserDefined.After.IsLvalueReference = !isRValRef; ICS.UserDefined.After.BindsToFunctionLvalue = false; ICS.UserDefined.After.BindsToRvalue = !LValRefType; - ICS.UserDefined.After.IsImplicitObjectArgumentQualificationConversion = - false; ICS.UserDefined.After.BindsImplicitObjectArgumentWithoutRefQualifier = false; ICS.UserDefined.After.ObjCLifetimeConversionBinding = false; ICS.UserDefined.After.FromBracedInitList = false; @@ -5807,7 +5802,6 @@ TryListConversion(Sema &S, InitListExpr *From, QualType ToType, StandardConversionSequence &SCS = Result.isStandard() ? Result.Standard : Result.UserDefined.After; SCS.ReferenceBinding = true; - SCS.IsImplicitObjectArgumentQualificationConversion = false; SCS.IsLvalueReference = ToType->isLValueReferenceType(); SCS.BindsToRvalue = true; SCS.BindsToFunctionLvalue = false; @@ -6005,12 +5999,8 @@ static ImplicitConversionSequence TryObjectArgumentInitialization( // affects the conversion rank. 
QualType ClassTypeCanon = S.Context.getCanonicalType(ClassType); ImplicitConversionKind SecondKind; - bool IsQualificationConversion = false; - if (ImplicitParamType.getCanonicalType() == FromTypeCanon) { + if (ClassTypeCanon == FromTypeCanon.getLocalUnqualifiedType()) { SecondKind = ICK_Identity; - } else if (ClassTypeCanon == FromTypeCanon.getLocalUnqualifiedType()) { - SecondKind = ICK_Identity; - IsQualificationConversion = true; } else if (S.IsDerivedFrom(Loc, FromType, ClassType)) { SecondKind = ICK_Derived_To_Base; } else if (!Method->isExplicitObjectMemberFunction()) { @@ -6051,8 +6041,6 @@ static ImplicitConversionSequence TryObjectArgumentInitialization( ICS.Standard.setFromType(FromType); ICS.Standard.setAllToTypes(ImplicitParamType); ICS.Standard.ReferenceBinding = true; - ICS.Standard.IsImplicitObjectArgumentQualificationConversion = - IsQualificationConversion; ICS.Standard.DirectBinding = true; ICS.Standard.IsLvalueReference = Method->getRefQualifier() != RQ_RValue; ICS.Standard.BindsToFunctionLvalue = false; @@ -11366,55 +11354,18 @@ OverloadingResult OverloadCandidateSet::BestViableFunction(Sema &S, DeferredCandidatesCount != 0 && !ResolutionByPerfectCandidateIsDisabled; if (TwoPhaseResolution) { - - PerfectViableFunction(S, Loc, Best); - if (Best != end()) - return ResultForBestCandidate(Best); + OverloadingResult Res = BestViableFunctionImpl(S, Loc, Best); + if (Best != end() && Best->isPerfectMatch(S.Context)) { + if (!(HasDeferredTemplateConstructors && + isa_and_nonnull(Best->Function))) + return Res; + } } InjectNonDeducedTemplateCandidates(S); return BestViableFunctionImpl(S, Loc, Best); } -void OverloadCandidateSet::PerfectViableFunction( - Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) { - - Best = end(); - for (auto It = Candidates.begin(); It != Candidates.end(); ++It) { - - if (!It->isPerfectMatch(S.getASTContext())) - continue; - - // We found a suitable conversion function - // but if there is a template constructor in the target class - // we might prefer that instead. - if (HasDeferredTemplateConstructors && - isa_and_nonnull(It->Function)) { - Best = end(); - break; - } - - if (Best == end()) { - Best = It; - continue; - } - if (Best->Function && It->Function) { - FunctionDecl *D = - S.getMoreConstrainedFunction(Best->Function, It->Function); - if (D == nullptr) { - Best = end(); - break; - } - if (D == It->Function) - Best = It; - continue; - } - // ambiguous - Best = end(); - break; - } -} - OverloadingResult OverloadCandidateSet::BestViableFunctionImpl( Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) { diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index 43f7992906c54..994cd07c1e263 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -1635,6 +1635,116 @@ bool SemaRISCV::isValidFMVExtension(StringRef Ext) { return -1 != RISCVISAInfo::getRISCVFeaturesBitsInfo(Ext).second; } +bool SemaRISCV::checkTargetVersionAttr(const StringRef Param, + const SourceLocation Loc) { + using namespace DiagAttrParams; + + llvm::SmallVector AttrStrs; + Param.split(AttrStrs, ';'); + + bool HasArch = false; + bool HasPriority = false; + bool HasDefault = false; + bool DuplicateAttr = false; + for (StringRef AttrStr : AttrStrs) { + AttrStr = AttrStr.trim(); + // Only support arch=+ext,... syntax. 
+ if (AttrStr.starts_with("arch=+")) { + DuplicateAttr = HasArch; + HasArch = true; + ParsedTargetAttr TargetAttr = + getASTContext().getTargetInfo().parseTargetAttr(AttrStr); + + if (TargetAttr.Features.empty() || + llvm::any_of(TargetAttr.Features, [&](const StringRef Ext) { + return !isValidFMVExtension(Ext); + })) + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << AttrStr << TargetVersion; + } else if (AttrStr == "default") { + DuplicateAttr = HasDefault; + HasDefault = true; + } else if (AttrStr.consume_front("priority=")) { + DuplicateAttr = HasPriority; + HasPriority = true; + unsigned Digit; + if (AttrStr.getAsInteger(0, Digit)) + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << AttrStr << TargetVersion; + } else { + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << AttrStr << TargetVersion; + } + } + + if (((HasPriority || HasArch) && HasDefault) || DuplicateAttr || + (HasPriority && !HasArch)) + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << Param << TargetVersion; + + return false; +} + +bool SemaRISCV::checkTargetClonesAttr( + SmallVectorImpl &Params, SmallVectorImpl &Locs, + SmallVectorImpl> &NewParams) { + using namespace DiagAttrParams; + + assert(Params.size() == Locs.size() && + "Mismatch between number of string parameters and locations"); + + bool HasDefault = false; + for (unsigned I = 0, E = Params.size(); I < E; ++I) { + const StringRef Param = Params[I].trim(); + const SourceLocation &Loc = Locs[I]; + + llvm::SmallVector AttrStrs; + Param.split(AttrStrs, ';'); + + bool IsPriority = false; + bool IsDefault = false; + for (StringRef AttrStr : AttrStrs) { + AttrStr = AttrStr.trim(); + // Only support arch=+ext,... syntax. 
+ if (AttrStr.starts_with("arch=+")) { + ParsedTargetAttr TargetAttr = + getASTContext().getTargetInfo().parseTargetAttr(AttrStr); + + if (TargetAttr.Features.empty() || + llvm::any_of(TargetAttr.Features, [&](const StringRef Ext) { + return !isValidFMVExtension(Ext); + })) + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << Param << TargetClones; + } else if (AttrStr == "default") { + IsDefault = true; + HasDefault = true; + } else if (AttrStr.consume_front("priority=")) { + IsPriority = true; + unsigned Digit; + if (AttrStr.getAsInteger(0, Digit)) + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << Param << TargetClones; + } else { + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << Param << TargetClones; + } + } + + if (IsPriority && IsDefault) + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << Param << TargetClones; + + if (llvm::is_contained(NewParams, Param)) + Diag(Loc, diag::warn_target_clone_duplicate_options); + NewParams.push_back(Param); + } + if (!HasDefault) + return Diag(Locs[0], diag::err_target_clone_must_have_default); + + return false; +} + SemaRISCV::SemaRISCV(Sema &S) : SemaBase(S) {} } // namespace clang diff --git a/clang/lib/Sema/SemaSPIRV.cpp b/clang/lib/Sema/SemaSPIRV.cpp index 76d3cff908b37..c8ea0d09c4081 100644 --- a/clang/lib/Sema/SemaSPIRV.cpp +++ b/clang/lib/Sema/SemaSPIRV.cpp @@ -46,6 +46,49 @@ static bool CheckAllArgsHaveSameType(Sema *S, CallExpr *TheCall) { return false; } +static bool CheckAllArgTypesAreCorrect( + Sema *S, CallExpr *TheCall, + llvm::ArrayRef< + llvm::function_ref> + Checks) { + unsigned NumArgs = TheCall->getNumArgs(); + assert(Checks.size() == NumArgs && + "Wrong number of checks for Number of args."); + // Apply each check to the corresponding argument + for (unsigned I = 0; I < NumArgs; ++I) { + Expr *Arg = TheCall->getArg(I); + if (Checks[I](S, Arg->getBeginLoc(), I + 1, Arg->getType())) + return true; + } + return false; +} + +static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc, + int ArgOrdinal, + clang::QualType PassedType) { + clang::QualType BaseType = + PassedType->isVectorType() + ? 
PassedType->castAs()->getElementType() + : PassedType; + if (!BaseType->isHalfType() && !BaseType->isFloat16Type() && + !BaseType->isFloat32Type()) + return S->Diag(Loc, diag::err_builtin_invalid_arg_type) + << ArgOrdinal << /* scalar or vector of */ 5 << /* no int */ 0 + << /* half or float */ 2 << PassedType; + return false; +} + +static bool CheckFloatOrHalfScalarRepresentation(Sema *S, SourceLocation Loc, + int ArgOrdinal, + clang::QualType PassedType) { + if (!PassedType->isHalfType() && !PassedType->isFloat16Type() && + !PassedType->isFloat32Type()) + return S->Diag(Loc, diag::err_builtin_invalid_arg_type) + << ArgOrdinal << /* scalar */ 1 << /* no int */ 0 + << /* half or float */ 2 << PassedType; + return false; +} + static std::optional processConstant32BitIntArgument(Sema &SemaRef, CallExpr *Call, int Argument) { ExprResult Arg = @@ -235,6 +278,43 @@ bool SemaSPIRV::CheckSPIRVBuiltinFunctionCall(const TargetInfo &TI, TheCall->setType(RetTy); break; } + case SPIRV::BI__builtin_spirv_refract: { + if (SemaRef.checkArgCount(TheCall, 3)) + return true; + + llvm::function_ref + ChecksArr[] = {CheckFloatOrHalfRepresentation, + CheckFloatOrHalfRepresentation, + CheckFloatOrHalfScalarRepresentation}; + if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall, + llvm::ArrayRef(ChecksArr))) + return true; + // Check that first two arguments are vectors/scalars of the same type + QualType Arg0Type = TheCall->getArg(0)->getType(); + if (!SemaRef.getASTContext().hasSameUnqualifiedType( + Arg0Type, TheCall->getArg(1)->getType())) + return SemaRef.Diag(TheCall->getBeginLoc(), + diag::err_vec_builtin_incompatible_vector) + << TheCall->getDirectCallee() << /* first two */ 0 + << SourceRange(TheCall->getArg(0)->getBeginLoc(), + TheCall->getArg(1)->getEndLoc()); + + // Check that scalar type of 3rd arg is same as base type of first two args + clang::QualType BaseType = + Arg0Type->isVectorType() + ? Arg0Type->castAs()->getElementType() + : Arg0Type; + if (!SemaRef.getASTContext().hasSameUnqualifiedType( + BaseType, TheCall->getArg(2)->getType())) + return SemaRef.Diag(TheCall->getBeginLoc(), + diag::err_hlsl_builtin_scalar_vector_mismatch) + << /* all */ 0 << TheCall->getDirectCallee() << Arg0Type + << TheCall->getArg(2)->getType(); + + QualType RetTy = TheCall->getArg(0)->getType(); + TheCall->setType(RetTy); + break; + } case SPIRV::BI__builtin_spirv_smoothstep: { if (SemaRef.checkArgCount(TheCall, 3)) return true; diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index b76619fc50268..698d1270be634 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -6299,7 +6299,6 @@ bool UnnamedLocalNoLinkageFinder::VisitNestedNameSpecifier( switch (NNS->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: return false; diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index d09a72b71b805..e1a975bcfb3e1 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3083,8 +3083,7 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( // If there was no default argument, deduction is incomplete. 
if (DefArg.getArgument().isNull()) { - Info.Param = makeTemplateParameter( - const_cast(TemplateParams->getParam(I))); + Info.Param = makeTemplateParameter(TemplateParams->getParam(I)); Info.reset( TemplateArgumentList::CreateCopy(S.Context, CTAI.SugaredConverted), TemplateArgumentList::CreateCopy(S.Context, CTAI.CanonicalConverted)); @@ -3100,8 +3099,7 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( if (S.CheckTemplateArgument( Param, DefArg, TD, TD->getLocation(), TD->getSourceRange().getEnd(), /*ArgumentPackIndex=*/0, CTAI, Sema::CTAK_Specified)) { - Info.Param = makeTemplateParameter( - const_cast(TemplateParams->getParam(I))); + Info.Param = makeTemplateParameter(TemplateParams->getParam(I)); // FIXME: These template arguments are temporary. Free them! Info.reset( TemplateArgumentList::CreateCopy(S.Context, CTAI.SugaredConverted), @@ -3227,7 +3225,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( if (ParamIdx >= TPL->size()) ParamIdx = TPL->size() - 1; - Decl *Param = const_cast(TPL->getParam(ParamIdx)); + Decl *Param = TPL->getParam(ParamIdx); Info.Param = makeTemplateParameter(Param); Info.FirstArg = Ps[ArgIdx].getArgument(); return TemplateDeductionResult::SubstitutionFailure; diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 5c149bdec7073..850bcb17bece1 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -954,6 +954,11 @@ bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, l = 0; u = 15; break; + case X86::BI__builtin_ia32_prefetchi: + i = 1; + l = 2; // _MM_HINT_T1 + u = 3; // _MM_HINT_T0 + break; } // Note that we don't force a hard error on the range check here, allowing @@ -1056,4 +1061,61 @@ void SemaX86::handleForceAlignArgPointerAttr(Decl *D, const ParsedAttr &AL) { X86ForceAlignArgPointerAttr(getASTContext(), AL)); } +bool SemaX86::checkTargetClonesAttr( + SmallVectorImpl &Params, SmallVectorImpl &Locs, + SmallVectorImpl> &NewParams) { + using namespace DiagAttrParams; + + assert(Params.size() == Locs.size() && + "Mismatch between number of string parameters and locations"); + + bool HasDefault = false; + bool HasComma = false; + for (unsigned I = 0, E = Params.size(); I < E; ++I) { + const StringRef Param = Params[I].trim(); + const SourceLocation &Loc = Locs[I]; + + if (Param.empty() || Param.ends_with(',')) + return Diag(Loc, diag::warn_unsupported_target_attribute) + << Unsupported << None << "" << TargetClones; + + if (Param.contains(',')) + HasComma = true; + + StringRef LHS; + StringRef RHS = Param; + do { + std::tie(LHS, RHS) = RHS.split(','); + LHS = LHS.trim(); + const SourceLocation &CurLoc = + Loc.getLocWithOffset(LHS.data() - Param.data()); + + if (LHS.starts_with("arch=")) { + if (!getASTContext().getTargetInfo().isValidCPUName( + LHS.drop_front(sizeof("arch=") - 1))) + return Diag(CurLoc, diag::warn_unsupported_target_attribute) + << Unsupported << CPU << LHS.drop_front(sizeof("arch=") - 1) + << TargetClones; + } else if (LHS == "default") + HasDefault = true; + else if (!getASTContext().getTargetInfo().isValidFeatureName(LHS) || + getASTContext().getTargetInfo().getFMVPriority(LHS) == 0) + return Diag(CurLoc, diag::warn_unsupported_target_attribute) + << Unsupported << None << LHS << TargetClones; + + if (llvm::is_contained(NewParams, LHS)) + Diag(CurLoc, diag::warn_target_clone_duplicate_options); + // Note: Add even if there are duplicates, since it changes name mangling. 
+ NewParams.push_back(LHS); + } while (!RHS.empty()); + } + if (HasComma && Params.size() > 1) + Diag(Locs[0], diag::warn_target_clone_mixed_values); + + if (!HasDefault) + return Diag(Locs[0], diag::err_target_clone_must_have_default); + + return false; +} + } // namespace clang diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 3e38f8b183dfd..c7428d1a02345 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4621,22 +4621,12 @@ NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( } case NestedNameSpecifier::Namespace: { - NamespaceDecl *NS = - cast_or_null(getDerived().TransformDecl( - Q.getLocalBeginLoc(), QNNS->getAsNamespace())); + auto *NS = cast(getDerived().TransformDecl( + Q.getLocalBeginLoc(), QNNS->getAsNamespace())); SS.Extend(SemaRef.Context, NS, Q.getLocalBeginLoc(), Q.getLocalEndLoc()); break; } - case NestedNameSpecifier::NamespaceAlias: { - NamespaceAliasDecl *Alias = - cast_or_null(getDerived().TransformDecl( - Q.getLocalBeginLoc(), QNNS->getAsNamespaceAlias())); - SS.Extend(SemaRef.Context, Alias, Q.getLocalBeginLoc(), - Q.getLocalEndLoc()); - break; - } - case NestedNameSpecifier::Global: // There is no meaningful transformation that one could perform on the // global scope. @@ -7255,6 +7245,12 @@ QualType TreeTransform::TransformDependentBitIntType( return Result; } +template +QualType TreeTransform::TransformPredefinedSugarType( + TypeLocBuilder &TLB, PredefinedSugarTypeLoc TL) { + llvm_unreachable("This type does not need to be transformed."); +} + /// Simple iterator that traverses the template arguments in a /// container that provides a \c getArgLoc() member function. /// diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index d0bb7fb1d06ad..10aedb68fcd9d 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -7574,11 +7574,16 @@ void TypeLocReader::VisitPipeTypeLoc(PipeTypeLoc TL) { void TypeLocReader::VisitBitIntTypeLoc(clang::BitIntTypeLoc TL) { TL.setNameLoc(readSourceLocation()); } + void TypeLocReader::VisitDependentBitIntTypeLoc( clang::DependentBitIntTypeLoc TL) { TL.setNameLoc(readSourceLocation()); } +void TypeLocReader::VisitPredefinedSugarTypeLoc(PredefinedSugarTypeLoc TL) { + // Nothing to do. 
+} + void ASTRecordReader::readTypeLoc(TypeLoc TL) { TypeLocReader TLR(*this); for (; !TL.isNull(); TL = TL.getNextTypeLoc()) @@ -10107,19 +10112,12 @@ ASTRecordReader::readNestedNameSpecifierLoc() { } case NestedNameSpecifier::Namespace: { - NamespaceDecl *NS = readDeclAs(); + auto *NS = readDeclAs(); SourceRange Range = readSourceRange(); Builder.Extend(Context, NS, Range.getBegin(), Range.getEnd()); break; } - case NestedNameSpecifier::NamespaceAlias: { - NamespaceAliasDecl *Alias = readDeclAs(); - SourceRange Range = readSourceRange(); - Builder.Extend(Context, Alias, Range.getBegin(), Range.getEnd()); - break; - } - case NestedNameSpecifier::TypeSpec: { TypeSourceInfo *T = readTypeSourceInfo(); if (!T) diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index b918bfbd549c3..bd84a9741d01b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -1889,7 +1889,7 @@ void ASTDeclReader::VisitNamespaceAliasDecl(NamespaceAliasDecl *D) { D->NamespaceLoc = readSourceLocation(); D->IdentLoc = readSourceLocation(); D->QualifierLoc = Record.readNestedNameSpecifierLoc(); - D->Namespace = readDeclAs(); + D->Namespace = readDeclAs(); mergeRedeclarable(D, Redecl); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 847283e9842e5..a6957e54b66f1 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -692,7 +692,6 @@ void TypeLocWriter::VisitAtomicTypeLoc(AtomicTypeLoc TL) { void TypeLocWriter::VisitPipeTypeLoc(PipeTypeLoc TL) { addSourceLocation(TL.getKWLoc()); } - void TypeLocWriter::VisitBitIntTypeLoc(clang::BitIntTypeLoc TL) { addSourceLocation(TL.getNameLoc()); } @@ -701,6 +700,11 @@ void TypeLocWriter::VisitDependentBitIntTypeLoc( addSourceLocation(TL.getNameLoc()); } +void TypeLocWriter::VisitPredefinedSugarTypeLoc( + clang::PredefinedSugarTypeLoc TL) { + // Nothing to do. 
+} + void ASTWriter::WriteTypeAbbrevs() { using namespace llvm; @@ -7093,11 +7097,6 @@ void ASTRecordWriter::AddNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) { AddSourceRange(NNS.getLocalSourceRange()); break; - case NestedNameSpecifier::NamespaceAlias: - AddDeclRef(NNS.getNestedNameSpecifier()->getAsNamespaceAlias()); - AddSourceRange(NNS.getLocalSourceRange()); - break; - case NestedNameSpecifier::TypeSpec: AddTypeRef(NNS.getTypeLoc().getType()); AddTypeLoc(NNS.getTypeLoc()); diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 30a04977d906d..68efdbaec341b 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -1281,7 +1281,7 @@ SVal MallocChecker::evalMulForBufferSize(CheckerContext &C, const Expr *Blocks, SVal BlockBytesVal = C.getSVal(BlockBytes); ProgramStateRef State = C.getState(); SVal TotalSize = SB.evalBinOp(State, BO_Mul, BlocksVal, BlockBytesVal, - SB.getContext().getSizeType()); + SB.getContext().getCanonicalSizeType()); return TotalSize; } @@ -1311,11 +1311,9 @@ static bool isStandardRealloc(const CallEvent &Call) { const FunctionDecl *FD = dyn_cast(Call.getDecl()); assert(FD); ASTContext &AC = FD->getASTContext(); - - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(1)->getType().getDesugaredType(AC) == - AC.getSizeType(); + return AC.hasSameType(FD->getDeclaredReturnType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(0)->getType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(1)->getType(), AC.getSizeType()); } static bool isGRealloc(const CallEvent &Call) { @@ -1323,10 +1321,9 @@ static bool isGRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(1)->getType().getDesugaredType(AC) == - AC.UnsignedLongTy; + return AC.hasSameType(FD->getDeclaredReturnType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(0)->getType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(1)->getType(), AC.UnsignedLongTy); } void MallocChecker::checkRealloc(ProgramStateRef State, const CallEvent &Call, @@ -2830,10 +2827,10 @@ MallocChecker::ReallocMemAux(CheckerContext &C, const CallEvent &Call, return nullptr; // Compare the size argument to 0. 
- DefinedOrUnknownSVal SizeZero = - svalBuilder.evalEQ(State, TotalSize.castAs(), - svalBuilder.makeIntValWithWidth( - svalBuilder.getContext().getSizeType(), 0)); + DefinedOrUnknownSVal SizeZero = svalBuilder.evalEQ( + State, TotalSize.castAs(), + svalBuilder.makeIntValWithWidth( + svalBuilder.getContext().getCanonicalSizeType(), 0)); ProgramStateRef StatePtrIsNull, StatePtrNotNull; std::tie(StatePtrIsNull, StatePtrNotNull) = State->assume(PtrEQ); diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 1c748f9bc1828..52b3d1e95942c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1666,7 +1666,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( const QualType IntTy = ACtx.IntTy; const QualType UnsignedIntTy = ACtx.UnsignedIntTy; const QualType LongTy = ACtx.LongTy; - const QualType SizeTy = ACtx.getSizeType(); + const QualType SizeTyCanonTy = ACtx.getCanonicalSizeType(); const QualType VoidPtrTy = getPointerTy(VoidTy); // void * const QualType IntPtrTy = getPointerTy(IntTy); // int * @@ -1684,14 +1684,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( const QualType ConstWchar_tPtrTy = getPointerTy(getConstTy(WCharTy)); // const wchar_t * const QualType ConstVoidPtrRestrictTy = getRestrictTy(ConstVoidPtrTy); - const QualType SizePtrTy = getPointerTy(SizeTy); + const QualType SizePtrTy = getPointerTy(SizeTyCanonTy); const QualType SizePtrRestrictTy = getRestrictTy(SizePtrTy); const RangeInt IntMax = BVF.getMaxValue(IntTy)->getLimitedValue(); const RangeInt UnsignedIntMax = BVF.getMaxValue(UnsignedIntTy)->getLimitedValue(); const RangeInt LongMax = BVF.getMaxValue(LongTy)->getLimitedValue(); - const RangeInt SizeMax = BVF.getMaxValue(SizeTy)->getLimitedValue(); + const RangeInt SizeMax = BVF.getMaxValue(SizeTyCanonTy)->getLimitedValue(); // Set UCharRangeMax to min of int or uchar maximum value. // The C standard states that the arguments of functions like isalpha must @@ -2057,18 +2057,19 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t fread(void *restrict ptr, size_t size, size_t nitems, // FILE *restrict stream); - addToFunctionSummaryMap( - "fread", - Signature(ArgTypes{VoidPtrRestrictTy, SizeTy, SizeTy, FilePtrRestrictTy}, - RetType{SizeTy}), - FreadSummary); + addToFunctionSummaryMap("fread", + Signature(ArgTypes{VoidPtrRestrictTy, SizeTyCanonTy, + SizeTyCanonTy, FilePtrRestrictTy}, + RetType{SizeTyCanonTy}), + FreadSummary); // size_t fwrite(const void *restrict ptr, size_t size, size_t nitems, // FILE *restrict stream); - addToFunctionSummaryMap("fwrite", - Signature(ArgTypes{ConstVoidPtrRestrictTy, SizeTy, - SizeTy, FilePtrRestrictTy}, - RetType{SizeTy}), - FreadSummary); + addToFunctionSummaryMap( + "fwrite", + Signature(ArgTypes{ConstVoidPtrRestrictTy, SizeTyCanonTy, SizeTyCanonTy, + FilePtrRestrictTy}, + RetType{SizeTyCanonTy}), + FreadSummary); std::optional Ssize_tTy = lookupTy("ssize_t"); std::optional Ssize_tMax = getMaxValue(Ssize_tTy); @@ -2083,12 +2084,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // should handle them together with the rest of the POSIX functions. 
// ssize_t read(int fildes, void *buf, size_t nbyte); addToFunctionSummaryMap( - "read", Signature(ArgTypes{IntTy, VoidPtrTy, SizeTy}, RetType{Ssize_tTy}), + "read", + Signature(ArgTypes{IntTy, VoidPtrTy, SizeTyCanonTy}, RetType{Ssize_tTy}), ReadSummary); // ssize_t write(int fildes, const void *buf, size_t nbyte); addToFunctionSummaryMap( "write", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy}, RetType{Ssize_tTy}), + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy}, + RetType{Ssize_tTy}), ReadSummary); auto GetLineSummary = @@ -2618,7 +2621,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *strndup(const char *s, size_t n); addToFunctionSummaryMap( "strndup", - Signature(ArgTypes{ConstCharPtrTy, SizeTy}, RetType{CharPtrTy}), + Signature(ArgTypes{ConstCharPtrTy, SizeTyCanonTy}, RetType{CharPtrTy}), Summary(NoEvalCall) .ArgConstraint(NotNull(ArgNo(0))) .ArgConstraint( @@ -2649,7 +2652,8 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *getcwd(char *buf, size_t size); addToFunctionSummaryMap( - "getcwd", Signature(ArgTypes{CharPtrTy, SizeTy}, RetType{CharPtrTy}), + "getcwd", + Signature(ArgTypes{CharPtrTy, SizeTyCanonTy}, RetType{CharPtrTy}), Summary(NoEvalCall) .Case({NotNull(0), ArgumentCondition(1, WithinRange, Range(1, SizeMax)), @@ -2957,8 +2961,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // FIXME: Improve for errno modeling. addToFunctionSummaryMap( "mmap", - Signature(ArgTypes{VoidPtrTy, SizeTy, IntTy, IntTy, IntTy, Off_tTy}, - RetType{VoidPtrTy}), + Signature( + ArgTypes{VoidPtrTy, SizeTyCanonTy, IntTy, IntTy, IntTy, Off_tTy}, + RetType{VoidPtrTy}), Summary(NoEvalCall) .ArgConstraint(ArgumentCondition(1, WithinRange, Range(1, SizeMax))) .ArgConstraint( @@ -2970,8 +2975,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // FIXME: Improve for errno modeling. 
addToFunctionSummaryMap( "mmap64", - Signature(ArgTypes{VoidPtrTy, SizeTy, IntTy, IntTy, IntTy, Off64_tTy}, - RetType{VoidPtrTy}), + Signature( + ArgTypes{VoidPtrTy, SizeTyCanonTy, IntTy, IntTy, IntTy, Off64_tTy}, + RetType{VoidPtrTy}), Summary(NoEvalCall) .ArgConstraint(ArgumentCondition(1, WithinRange, Range(1, SizeMax))) .ArgConstraint( @@ -3002,8 +3008,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t bufsize); addToFunctionSummaryMap( "readlink", - Signature(ArgTypes{ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTy}, - RetType{Ssize_tTy}), + Signature( + ArgTypes{ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTyCanonTy}, + RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ArgumentCondition(2, WithinRange, Range(1, IntMax)), ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3025,9 +3032,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *restrict buf, size_t bufsize); addToFunctionSummaryMap( "readlinkat", - Signature( - ArgTypes{IntTy, ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTy}, - RetType{Ssize_tTy}), + Signature(ArgTypes{IntTy, ConstCharPtrRestrictTy, CharPtrRestrictTy, + SizeTyCanonTy}, + RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ArgumentCondition(3, WithinRange, Range(1, IntMax)), ReturnValueCondition(LessThanOrEq, ArgNo(3)), @@ -3268,14 +3275,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t length, // int flags, struct sockaddr *restrict address, // socklen_t *restrict address_len); - Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTy, IntTy, + Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTyCanonTy, IntTy, StructSockaddrPtrRestrictTy, Socklen_tPtrRestrictTy}, RetType{Ssize_tTy}), Recvfrom)) addToFunctionSummaryMap( "recvfrom", - Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTy, IntTy, + Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTyCanonTy, IntTy, Irrelevant, Socklen_tPtrRestrictTy}, RetType{Ssize_tTy}), Recvfrom); @@ -3297,14 +3304,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t sendto(int socket, const void *message, size_t length, // int flags, const struct sockaddr *dest_addr, // socklen_t dest_len); - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy, ConstStructSockaddrPtrTy, Socklen_tTy}, RetType{Ssize_tTy}), Sendto)) addToFunctionSummaryMap( "sendto", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy, Irrelevant, - Socklen_tTy}, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy, + Irrelevant, Socklen_tTy}, RetType{Ssize_tTy}), Sendto); @@ -3320,7 +3327,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t recv(int sockfd, void *buf, size_t len, int flags); addToFunctionSummaryMap( "recv", - Signature(ArgTypes{IntTy, VoidPtrTy, SizeTy, IntTy}, + Signature(ArgTypes{IntTy, VoidPtrTy, SizeTyCanonTy, IntTy}, RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3395,7 +3402,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t send(int sockfd, const void *buf, size_t len, int flags); addToFunctionSummaryMap( "send", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy}, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy}, RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3683,7 +3690,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // int pthread_attr_setguardsize(pthread_attr_t *attr, size_t guardsize); 
addToFunctionSummaryMap( {"pthread_attr_setstacksize", "pthread_attr_setguardsize"}, - Signature(ArgTypes{Pthread_attr_tPtrTy, SizeTy}, RetType{IntTy}), + Signature(ArgTypes{Pthread_attr_tPtrTy, SizeTyCanonTy}, RetType{IntTy}), Summary(NoEvalCall) .ArgConstraint(NotNull(ArgNo(0))) .ArgConstraint( @@ -3888,13 +3895,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( .ArgConstraint(NotNull(ArgNo(1)))); addToFunctionSummaryMap( "__buf_size_arg_constraint", - Signature(ArgTypes{ConstVoidPtrTy, SizeTy}, RetType{IntTy}), + Signature(ArgTypes{ConstVoidPtrTy, SizeTyCanonTy}, RetType{IntTy}), Summary(EvalCallAsPure) .ArgConstraint( BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1)))); addToFunctionSummaryMap( "__buf_size_arg_constraint_mul", - Signature(ArgTypes{ConstVoidPtrTy, SizeTy, SizeTy}, RetType{IntTy}), + Signature(ArgTypes{ConstVoidPtrTy, SizeTyCanonTy, SizeTyCanonTy}, + RetType{IntTy}), Summary(EvalCallAsPure) .ArgConstraint(BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1), /*BufSizeMultiplier=*/ArgNo(2)))); diff --git a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp index 1042b43680fd2..c97341f072aba 100644 --- a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp @@ -92,7 +92,7 @@ ProgramStateRef VLASizeChecker::checkVLA(CheckerContext &C, ASTContext &Ctx = C.getASTContext(); SValBuilder &SVB = C.getSValBuilder(); - CanQualType SizeTy = Ctx.getSizeType(); + QualType SizeTy = Ctx.getSizeType(); uint64_t SizeMax = SVB.getBasicValueFactory().getMaxValue(SizeTy)->getZExtValue(); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index c77ef26da568d..d87484470f8b5 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1941,7 +1941,6 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::ConceptSpecializationExprClass: case Stmt::CXXRewrittenBinaryOperatorClass: case Stmt::RequiresExprClass: - case Expr::CXXParenListInitExprClass: case Stmt::EmbedExprClass: // Fall through. 
@@ -2315,11 +2314,22 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, break; } - case Stmt::InitListExprClass: + case Stmt::InitListExprClass: { + const InitListExpr *E = cast(S); Bldr.takeNodes(Pred); - VisitInitListExpr(cast(S), Pred, Dst); + ConstructInitList(E, E->inits(), E->isTransparent(), Pred, Dst); Bldr.addNodes(Dst); break; + } + + case Expr::CXXParenListInitExprClass: { + const CXXParenListInitExpr *E = cast(S); + Bldr.takeNodes(Pred); + ConstructInitList(E, E->getInitExprs(), /*IsTransparent*/ false, Pred, + Dst); + Bldr.addNodes(Dst); + break; + } case Stmt::MemberExprClass: Bldr.takeNodes(Pred); @@ -4114,3 +4124,33 @@ void *ProgramStateTrait::GDMIndex() { } void ExprEngine::anchor() { } + +void ExprEngine::ConstructInitList(const Expr *E, ArrayRef Args, + bool IsTransparent, ExplodedNode *Pred, + ExplodedNodeSet &Dst) { + assert((isa(E))); + + const LocationContext *LC = Pred->getLocationContext(); + + StmtNodeBuilder B(Pred, Dst, *currBldrCtx); + ProgramStateRef S = Pred->getState(); + QualType T = E->getType().getCanonicalType(); + + bool IsCompound = T->isArrayType() || T->isRecordType() || + T->isAnyComplexType() || T->isVectorType(); + + if (Args.size() > 1 || (E->isPRValue() && IsCompound && !IsTransparent)) { + llvm::ImmutableList ArgList = getBasicVals().getEmptySValList(); + for (Expr *E : llvm::reverse(Args)) + ArgList = getBasicVals().prependSVal(S->getSVal(E, LC), ArgList); + + B.generateNode(E, Pred, + S->BindExpr(E, LC, svalBuilder.makeCompoundVal(T, ArgList))); + } else { + B.generateNode(E, Pred, + S->BindExpr(E, LC, + Args.size() == 0 + ? getSValBuilder().makeZeroVal(T) + : S->getSVal(Args.front(), LC))); + } +} diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp index fa8e669b6bb2f..f1a25a750dd0d 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp @@ -771,54 +771,6 @@ void ExprEngine::VisitLogicalExpr(const BinaryOperator* B, ExplodedNode *Pred, Bldr.generateNode(B, Pred, state->BindExpr(B, Pred->getLocationContext(), X)); } -void ExprEngine::VisitInitListExpr(const InitListExpr *IE, - ExplodedNode *Pred, - ExplodedNodeSet &Dst) { - StmtNodeBuilder B(Pred, Dst, *currBldrCtx); - - ProgramStateRef state = Pred->getState(); - const LocationContext *LCtx = Pred->getLocationContext(); - QualType T = getContext().getCanonicalType(IE->getType()); - unsigned NumInitElements = IE->getNumInits(); - - if (!IE->isGLValue() && !IE->isTransparent() && - (T->isArrayType() || T->isRecordType() || T->isVectorType() || - T->isAnyComplexType())) { - llvm::ImmutableList vals = getBasicVals().getEmptySValList(); - - // Handle base case where the initializer has no elements. - // e.g: static int* myArray[] = {}; - if (NumInitElements == 0) { - SVal V = svalBuilder.makeCompoundVal(T, vals); - B.generateNode(IE, Pred, state->BindExpr(IE, LCtx, V)); - return; - } - - for (const Stmt *S : llvm::reverse(*IE)) { - SVal V = state->getSVal(cast(S), LCtx); - vals = getBasicVals().prependSVal(V, vals); - } - - B.generateNode(IE, Pred, - state->BindExpr(IE, LCtx, - svalBuilder.makeCompoundVal(T, vals))); - return; - } - - // Handle scalars: int{5} and int{} and GLvalues. - // Note, if the InitListExpr is a GLvalue, it means that there is an address - // representing it, so it must have a single init element. 
- assert(NumInitElements <= 1); - - SVal V; - if (NumInitElements == 0) - V = getSValBuilder().makeZeroVal(T); - else - V = state->getSVal(IE->getInit(0), LCtx); - - B.generateNode(IE, Pred, state->BindExpr(IE, LCtx, V)); -} - void ExprEngine::VisitGuardedExpr(const Expr *Ex, const Expr *L, const Expr *R, diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 9bd85479d9810..8ce2706cb1062 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -24,7 +24,6 @@ #include "clang/Tooling/DependencyScanning/DependencyScanningService.h" #include "clang/Tooling/DependencyScanning/InProcessModuleCache.h" #include "clang/Tooling/DependencyScanning/ModuleDepCollector.h" -#include "clang/Tooling/Tooling.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" @@ -376,25 +375,23 @@ class ScanningDependencyDirectivesGetter : public DependencyDirectivesGetter { /// A clang tool that runs the preprocessor in a mode that's optimized for /// dependency scanning for the given compiler invocation. -class DependencyScanningAction : public tooling::ToolAction { +class DependencyScanningAction { public: DependencyScanningAction( DependencyScanningService &Service, StringRef WorkingDirectory, DependencyConsumer &Consumer, DependencyActionController &Controller, llvm::IntrusiveRefCntPtr DepFS, - bool DisableFree, std::optional ModuleName = std::nullopt) + std::optional ModuleName = std::nullopt) : Service(Service), WorkingDirectory(WorkingDirectory), Consumer(Consumer), Controller(Controller), DepFS(std::move(DepFS)), - DisableFree(DisableFree), ModuleName(ModuleName) {} + ModuleName(ModuleName) {} bool runInvocation(std::shared_ptr Invocation, - FileManager *DriverFileMgr, + IntrusiveRefCntPtr FS, std::shared_ptr PCHContainerOps, - DiagnosticConsumer *DiagConsumer) override { + DiagnosticConsumer *DiagConsumer) { // Make a deep copy of the original Clang invocation. CompilerInvocation OriginalInvocation(*Invocation); - // Restore the value of DisableFree, which may be modified by Tooling. - OriginalInvocation.getFrontendOpts().DisableFree = DisableFree; if (any(Service.getOptimizeArgs() & ScanningOptimizations::Macros)) canonicalizeDefines(OriginalInvocation.getPreprocessorOpts()); @@ -419,8 +416,8 @@ class DependencyScanningAction : public tooling::ToolAction { // Create the compiler's actual diagnostics engine. 
sanitizeDiagOpts(ScanInstance.getDiagnosticOpts()); assert(!DiagConsumerFinished && "attempt to reuse finished consumer"); - ScanInstance.createDiagnostics(DriverFileMgr->getVirtualFileSystem(), - DiagConsumer, /*ShouldOwnClient=*/false); + ScanInstance.createDiagnostics(*FS, DiagConsumer, + /*ShouldOwnClient=*/false); if (!ScanInstance.hasDiagnostics()) return false; @@ -431,6 +428,7 @@ class DependencyScanningAction : public tooling::ToolAction { ScanInstance.getHeaderSearchOpts().BuildSessionTimestamp = Service.getBuildSessionTimestamp(); + ScanInstance.getFrontendOpts().DisableFree = false; ScanInstance.getFrontendOpts().GenerateGlobalModuleIndex = false; ScanInstance.getFrontendOpts().UseGlobalModuleIndex = false; // This will prevent us compiling individual modules asynchronously since @@ -441,9 +439,9 @@ class DependencyScanningAction : public tooling::ToolAction { any(Service.getOptimizeArgs() & ScanningOptimizations::VFS); // Support for virtual file system overlays. - auto FS = createVFSFromCompilerInvocation( - ScanInstance.getInvocation(), ScanInstance.getDiagnostics(), - DriverFileMgr->getVirtualFileSystemPtr()); + FS = createVFSFromCompilerInvocation(ScanInstance.getInvocation(), + ScanInstance.getDiagnostics(), + std::move(FS)); // Create a new FileManager to match the invocation's FileSystemOptions. auto *FileMgr = ScanInstance.createFileManager(FS); @@ -554,9 +552,6 @@ class DependencyScanningAction : public tooling::ToolAction { if (Result) setLastCC1Arguments(std::move(OriginalInvocation)); - // Propagate the statistics to the parent FileManager. - DriverFileMgr->AddStats(ScanInstance.getFileManager()); - return Result; } @@ -584,7 +579,6 @@ class DependencyScanningAction : public tooling::ToolAction { DependencyConsumer &Consumer; DependencyActionController &Controller; llvm::IntrusiveRefCntPtr DepFS; - bool DisableFree; std::optional ModuleName; std::optional ScanInstanceStorage; std::shared_ptr MDC; @@ -669,15 +663,14 @@ llvm::Error DependencyScanningWorker::computeDependencies( } static bool forEachDriverJob( - ArrayRef ArgStrs, DiagnosticsEngine &Diags, FileManager &FM, + ArrayRef ArgStrs, DiagnosticsEngine &Diags, + IntrusiveRefCntPtr FS, llvm::function_ref Callback) { SmallVector Argv; Argv.reserve(ArgStrs.size()); for (const std::string &Arg : ArgStrs) Argv.push_back(Arg.c_str()); - llvm::vfs::FileSystem *FS = &FM.getVirtualFileSystem(); - std::unique_ptr Driver = std::make_unique( Argv[0], llvm::sys::getDefaultTargetTriple(), Diags, "clang LLVM compiler", FS); @@ -687,7 +680,8 @@ static bool forEachDriverJob( bool CLMode = driver::IsClangCL( driver::getDriverMode(Argv[0], ArrayRef(Argv).slice(1))); - if (llvm::Error E = driver::expandResponseFiles(Argv, CLMode, Alloc, FS)) { + if (llvm::Error E = + driver::expandResponseFiles(Argv, CLMode, Alloc, FS.get())) { Diags.Report(diag::err_drv_expand_response_file) << llvm::toString(std::move(E)); return false; @@ -710,17 +704,25 @@ static bool forEachDriverJob( static bool createAndRunToolInvocation( std::vector CommandLine, DependencyScanningAction &Action, - FileManager &FM, + IntrusiveRefCntPtr FS, std::shared_ptr &PCHContainerOps, DiagnosticsEngine &Diags, DependencyConsumer &Consumer) { // Save executable path before providing CommandLine to ToolInvocation std::string Executable = CommandLine[0]; - ToolInvocation Invocation(std::move(CommandLine), &Action, &FM, - PCHContainerOps); - Invocation.setDiagnosticConsumer(Diags.getClient()); - Invocation.setDiagnosticOptions(&Diags.getDiagnosticOptions()); - if 
(!Invocation.run()) + + llvm::opt::ArgStringList Argv; + for (const std::string &Str : ArrayRef(CommandLine).drop_front()) + Argv.push_back(Str.c_str()); + + auto Invocation = std::make_shared(); + if (!CompilerInvocation::CreateFromArgs(*Invocation, Argv, Diags)) { + // FIXME: Should we just go on like cc1_main does? + return false; + } + + if (!Action.runInvocation(std::move(Invocation), std::move(FS), + PCHContainerOps, Diags.getClient())) return false; std::vector Args = Action.takeLastCC1Arguments(); @@ -733,37 +735,24 @@ bool DependencyScanningWorker::scanDependencies( DependencyConsumer &Consumer, DependencyActionController &Controller, DiagnosticConsumer &DC, llvm::IntrusiveRefCntPtr FS, std::optional ModuleName) { - auto FileMgr = - llvm::makeIntrusiveRefCnt(FileSystemOptions{}, FS); - std::vector CCommandLine(CommandLine.size(), nullptr); llvm::transform(CommandLine, CCommandLine.begin(), [](const std::string &Str) { return Str.c_str(); }); auto DiagOpts = CreateAndPopulateDiagOpts(CCommandLine); sanitizeDiagOpts(*DiagOpts); - IntrusiveRefCntPtr Diags = - CompilerInstance::createDiagnostics(FileMgr->getVirtualFileSystem(), - *DiagOpts, &DC, - /*ShouldOwnClient=*/false); - - // Although `Diagnostics` are used only for command-line parsing, the - // custom `DiagConsumer` might expect a `SourceManager` to be present. - SourceManager SrcMgr(*Diags, *FileMgr); - Diags->setSourceManager(&SrcMgr); - // DisableFree is modified by Tooling for running - // in-process; preserve the original value, which is - // always true for a driver invocation. - bool DisableFree = true; + auto Diags = CompilerInstance::createDiagnostics(*FS, *DiagOpts, &DC, + /*ShouldOwnClient=*/false); + DependencyScanningAction Action(Service, WorkingDirectory, Consumer, - Controller, DepFS, DisableFree, ModuleName); + Controller, DepFS, ModuleName); bool Success = false; if (CommandLine[1] == "-cc1") { - Success = createAndRunToolInvocation(CommandLine, Action, *FileMgr, + Success = createAndRunToolInvocation(CommandLine, Action, FS, PCHContainerOps, *Diags, Consumer); } else { Success = forEachDriverJob( - CommandLine, *Diags, *FileMgr, [&](const driver::Command &Cmd) { + CommandLine, *Diags, FS, [&](const driver::Command &Cmd) { if (StringRef(Cmd.getCreator().getName()) != "clang") { // Non-clang command. Just pass through to the dependency // consumer. @@ -782,7 +771,7 @@ bool DependencyScanningWorker::scanDependencies( // system to ensure that any file system requests that // are made by the driver do not go through the // dependency scanning filesystem. - return createAndRunToolInvocation(std::move(Argv), Action, *FileMgr, + return createAndRunToolInvocation(std::move(Argv), Action, FS, PCHContainerOps, *Diags, Consumer); }); } diff --git a/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp b/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp index 9f10ee1c0d3f8..2b5a293b35841 100644 --- a/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp +++ b/clang/lib/Tooling/Inclusions/HeaderIncludes.cpp @@ -285,8 +285,7 @@ HeaderIncludes::HeaderIncludes(StringRef FileName, StringRef Code, MaxInsertOffset(MinInsertOffset + getMaxHeaderInsertionOffset( FileName, Code.drop_front(MinInsertOffset), Style)), - MainIncludeFound(false), - Categories(Style, FileName) { + MainIncludeFound(false), Categories(Style, FileName) { // Add 0 for main header and INT_MAX for headers that are not in any // category. 
Priorities = {0, INT_MAX}; diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp index b88e6db7cceb7..807a8d8a34ad7 100644 --- a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp +++ b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp @@ -131,7 +131,7 @@ static int initialize(Lang Language) { Mapping->SymbolNames[SymIndex] = { QName.data(), NSLen, static_cast(QName.size() - NSLen)}; if (!HeaderName.empty()) - Mapping->SymbolHeaderIDs[SymIndex].push_back(AddHeader(HeaderName)); + Mapping->SymbolHeaderIDs[SymIndex].push_back(AddHeader(HeaderName)); NSSymbolMap &NSSymbols = AddNS(QName.take_front(NSLen)); NSSymbols.try_emplace(QName.drop_front(NSLen), SymIndex); @@ -236,7 +236,7 @@ std::optional Symbol::named(llvm::StringRef Scope, llvm::StringRef Name, return std::nullopt; } std::optional
Symbol::header() const { - const auto& Headers = getMappingPerLang(Language)->SymbolHeaderIDs[ID]; + const auto &Headers = getMappingPerLang(Language)->SymbolHeaderIDs[ID]; if (Headers.empty()) return std::nullopt; return Header(Headers.front(), Language); diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index caac719caf8e8..eb9fa7a7fa1e8 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -950,7 +950,6 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> { case NestedNameSpecifier::Global: return syntax::NodeKind::GlobalNameSpecifier; case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Identifier: return syntax::NodeKind::IdentifierNameSpecifier; case NestedNameSpecifier::TypeSpec: { diff --git a/clang/test/AST/ByteCode/builtin-bit-cast.cpp b/clang/test/AST/ByteCode/builtin-bit-cast.cpp index 3c5e89d7d5a74..bc356b0b6e122 100644 --- a/clang/test/AST/ByteCode/builtin-bit-cast.cpp +++ b/clang/test/AST/ByteCode/builtin-bit-cast.cpp @@ -22,6 +22,10 @@ typedef __INTPTR_TYPE__ intptr_t; static_assert(sizeof(int) == 4); static_assert(sizeof(long long) == 8); + +constexpr bool test_bad_bool = __builtin_bit_cast(bool, (char)0xff); // both-error {{must be initialized by a constant expression}} \ + // both-note {{value 255 cannot be represented in type 'bool'}} + template <class To, class From> constexpr To bit_cast(const From &from) { static_assert(sizeof(To) == sizeof(From)); diff --git a/clang/test/AST/ByteCode/builtin-constant-p.cpp b/clang/test/AST/ByteCode/builtin-constant-p.cpp index 9f5521590833d..315a907949c34 100644 --- a/clang/test/AST/ByteCode/builtin-constant-p.cpp +++ b/clang/test/AST/ByteCode/builtin-constant-p.cpp @@ -140,3 +140,11 @@ void test17(void) { F("string literal" + 1); // both-warning {{adding}} \ // both-note {{use array indexing}} } + +/// FIXME +static void foo(int i) __attribute__((__diagnose_if__(!__builtin_constant_p(i), "not constant", "error"))) // expected-note {{from}} +{ +} +static void bar(int i) { + foo(15); // expected-error {{not constant}} +} diff --git a/clang/test/AST/ByteCode/complex.cpp b/clang/test/AST/ByteCode/complex.cpp index 2c0111c53d3bf..be10b3cfa53da 100644 --- a/clang/test/AST/ByteCode/complex.cpp +++ b/clang/test/AST/ByteCode/complex.cpp @@ -396,10 +396,9 @@ namespace ComplexConstexpr { // both-note {{cannot refer to element 3 of array of 2 elements}} constexpr _Complex float *p = 0; constexpr float pr = __real *p; // both-error {{constant expr}} \ - // ref-note {{cannot access real component of null}} \ - // expected-note {{read of dereferenced null pointer}} + // both-note {{dereferencing a null pointer}} constexpr float pi = __imag *p; // both-error {{constant expr}} \ - // ref-note {{cannot access imaginary component of null}} + // both-note {{dereferencing a null pointer}} constexpr const _Complex double *q = &test3 + 1; constexpr double qr = __real *q; // ref-error {{constant expr}} \ // ref-note {{cannot access real component of pointer past the end}} diff --git a/clang/test/AST/ByteCode/const-eval.c b/clang/test/AST/ByteCode/const-eval.c index eab14c08ec809..3e228226ac8c1 100644 --- a/clang/test/AST/ByteCode/const-eval.c +++ b/clang/test/AST/ByteCode/const-eval.c @@ -180,6 +180,9 @@ typedef __INTPTR_TYPE__ intptr_t; const intptr_t A = (intptr_t)(((int*) 0) + 1); const intptr_t B = (intptr_t)(((char*)0) + 3); _Static_assert(A > B, ""); +int * GH149500_p = &(*(int *)0x400); +static const void *GH149500_q = &(*(const 
struct sysrq_key_op *)0); + #else #error :( #endif diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index b34e7823220e2..378702f9b3620 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -39,7 +39,8 @@ struct S { constexpr S s = { 5 }; constexpr const int *p = &s.m + 1; -constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // ok +constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // both-error {{constexpr variable 'np2' must be initialized by a constant expression}} \ + // both-note {{dereferencing a null pointer is not allowed in a constant expression}} constexpr int preDec(int x) { // both-error {{never produces a constant expression}} return --x; // both-note {{subexpression}} diff --git a/clang/test/AST/ByteCode/mutable.cpp b/clang/test/AST/ByteCode/mutable.cpp index aebbea920578c..35c5a0389921e 100644 --- a/clang/test/AST/ByteCode/mutable.cpp +++ b/clang/test/AST/ByteCode/mutable.cpp @@ -1,11 +1,7 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++11 -verify=expected,expected11,both,both11 %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify=expected,expected14,both %s -// RUN: %clang_cc1 -std=c++11 -verify=ref,ref11,both,both11 %s -// RUN: %clang_cc1 -std=c++14 -verify=ref,ref14,both %s - - - - +// RUN: %clang_cc1 -std=c++11 -verify=expected,expected11,both,both11 %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++14 -verify=expected,expected14,both %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++11 -verify=ref,ref11,both,both11 %s +// RUN: %clang_cc1 -std=c++14 -verify=ref,ref14,both %s namespace Simple { struct S { @@ -26,3 +22,47 @@ namespace Simple { static_assert(s2.a2 == 12, ""); // both11-error {{not an integral constant expression}} \ // both11-note {{initializer of 's2' is not a constant expression}} } +#if __cplusplus >= 201402L +namespace ConstInMutable { + class B { + public: + + const int f; + constexpr B() : f(12) {} + }; + class A { + public: + mutable B b; + constexpr A() = default; + }; + constexpr int constInMutable() { + A a; + + int *m = (int*)&a.b.f; + *m = 12; // both-note {{modification of object of const-qualified type 'const int' is not allowed in a constant expression}} + return 1; + } + static_assert(constInMutable() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} +} + +namespace MutableInConst { + class C { + public: + mutable int c; + constexpr C() : c(50) {} + }; + class D { + public: + C c; + constexpr D() {} + }; + constexpr int mutableInConst() { + const D d{}; + int *m = (int*)&d.c.c; + *m = 12; + return 1; + } + static_assert(mutableInConst() == 1, ""); +} +#endif diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index 670def2d5870e..b587cd6eaf89c 100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -486,3 +486,11 @@ namespace bitcast { } static_assert(foo() == 0); } + +constexpr int modify_const_variable() { + const int a = 10; + new ((int *)&a) int(12); // both-note {{modification of object of const-qualified type 'const int' is not allowed in a constant expression}} + return a; +} +static_assert(modify_const_variable()); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index d369c64bc3904..5ca3e2d12e2df 100644 
--- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -413,7 +413,7 @@ namespace DeriveFailures { constexpr Derived(int i) : OtherVal(i) {} // ref-error {{never produces a constant expression}} \ // both-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} \ - // ref-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} + // ref-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} }; constexpr Derived D(12); // both-error {{must be initialized by a constant expression}} \ @@ -1660,9 +1660,9 @@ namespace NullptrCast { constexpr A *na = nullptr; constexpr B *nb = nullptr; constexpr A &ra = *nb; // both-error {{constant expression}} \ - // both-note {{cannot access base class of null pointer}} + // both-note {{dereferencing a null pointer}} constexpr B &rb = (B&)*na; // both-error {{constant expression}} \ - // both-note {{cannot access derived class of null pointer}} + // both-note {{dereferencing a null pointer}} constexpr bool test() { auto a = (A*)(B*)nullptr; @@ -1740,7 +1740,7 @@ namespace CtorOfInvalidClass { #if __cplusplus >= 202002L template concept ReferenceOf = Q; - /// This calls a valid and constexpr copy constructor of InvalidCtor, + /// This calls a valid and constexpr copy constructor of InvalidCtor, /// but should still be rejected. template auto R, typename Rep> int F; // both-error {{non-type template argument is not a constant expression}} #endif diff --git a/clang/test/AST/ByteCode/unions.cpp b/clang/test/AST/ByteCode/unions.cpp index 0fa44a259a4ff..7cfd0d677a7b3 100644 --- a/clang/test/AST/ByteCode/unions.cpp +++ b/clang/test/AST/ByteCode/unions.cpp @@ -847,6 +847,20 @@ namespace Activation2 { } static_assert(change_member_indirectly() == 4); } + +namespace CopyCtorMutable { + struct E { + union { // expected-note {{read of mutable member 'b'}} + int a; + mutable int b; // both-note {{here}} + }; + }; + constexpr E e1 = {{1}}; + constexpr E e2 = e1; // both-error {{constant}} \ + // ref-note {{read of mutable member 'b'}} \ + // both-note {{in call}} +} + #endif namespace AddressComparison { diff --git a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl index a4f6e6c44794e..fa8d78f38494a 100644 --- a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl +++ b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl @@ -9,7 +9,7 @@ // CHECK: | `-TemplateTypeParm {{.*}} 'element_type' // CHECK: `-BinaryOperator {{.*}} 'bool' lvalue '>=' // CHECK: |-UnaryExprOrTypeTraitExpr {{.*}} 'bool' sizeof 'element_type' -// CHECK: `-IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK: `-IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 StructuredBuffer Buffer; diff --git a/clang/test/AST/ast-dump-APValue-lvalue.cpp b/clang/test/AST/ast-dump-APValue-lvalue.cpp index 51d22a5ba8b6d..f4cf2f5291760 100644 --- a/clang/test/AST/ast-dump-APValue-lvalue.cpp +++ b/clang/test/AST/ast-dump-APValue-lvalue.cpp @@ -67,6 +67,10 @@ void Test(int (&arr)[10]) { // CHECK-NEXT: | |-value: LValue Base=TypeInfoLValue typeid(int), Null=0, Offset=0, HasPath=1, PathLength=0, Path=() constexpr int(MP::*pmi) = (int MP::*)&P::x; - // CHECK: `-VarDecl {{.*}} col:{{.*}} pmi 'int (MP::*const)' constexpr cinit - // CHECK-NEXT: |-value: MemberPointer MP::x + // CHECK: | `-VarDecl {{.*}} col:{{.*}} pmi 'int (MP::*const)' constexpr cinit + // CHECK-NEXT: | |-value: 
MemberPointer MP::x + + constexpr int(MP::*pmn) = (int MP::*)nullptr; + // CHECK: `-VarDecl {{.*}} col:{{.*}} pmn 'int (MP::*const)' constexpr cinit + // CHECK-NEXT: |-value: MemberPointer null } diff --git a/clang/test/AST/ast-dump-array.cpp b/clang/test/AST/ast-dump-array.cpp index 15771f227df8a..5a982d34683ff 100644 --- a/clang/test/AST/ast-dump-array.cpp +++ b/clang/test/AST/ast-dump-array.cpp @@ -14,7 +14,7 @@ void testArrayInitExpr() auto l = [a]{ }; // CHECK: |-ArrayInitLoopExpr 0x{{[^ ]*}} 'int[10]' - // CHECK: | `-ArrayInitIndexExpr 0x{{[^ ]*}} <> 'unsigned long' + // CHECK: | `-ArrayInitIndexExpr 0x{{[^ ]*}} <> '__size_t':'unsigned long' } template diff --git a/clang/test/AST/ast-dump-expr-json.c b/clang/test/AST/ast-dump-expr-json.c index e910864eeed65..ecb6191c52200 100644 --- a/clang/test/AST/ast-dump-expr-json.c +++ b/clang/test/AST/ast-dump-expr-json.c @@ -3911,7 +3911,8 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3964,7 +3965,8 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3989,7 +3991,8 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "alignof", diff --git a/clang/test/AST/ast-dump-expr-json.cpp b/clang/test/AST/ast-dump-expr-json.cpp index 5a762acad7917..11026c9d302f0 100644 --- a/clang/test/AST/ast-dump-expr-json.cpp +++ b/clang/test/AST/ast-dump-expr-json.cpp @@ -1545,7 +1545,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "Ts" @@ -1587,7 +1588,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "long" +// CHECK-NEXT: "desugaredQualType": "long", +// CHECK-NEXT: "qualType": "__ptrdiff_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "opcode": "-", @@ -1726,7 +1728,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: }, @@ -1755,7 +1757,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: }, @@ -1785,7 +1787,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// 
CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1860,7 +1862,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1880,7 +1882,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -1937,7 +1940,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1957,7 +1960,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -2333,7 +2337,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ diff --git a/clang/test/AST/ast-dump-expr.c b/clang/test/AST/ast-dump-expr.c index 959d61ec9794b..e7aba39be8f68 100644 --- a/clang/test/AST/ast-dump-expr.c +++ b/clang/test/AST/ast-dump-expr.c @@ -222,15 +222,15 @@ void UnaryOperators(int a, int *b) { // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int' lvalue ParmVar 0x{{[^ ]*}} 'a' 'int' sizeof a; - // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' sizeof + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' sizeof // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int' lvalue ParmVar 0x{{[^ ]*}} 'a' 'int' sizeof(int); - // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' sizeof 'int' + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' sizeof 'int' _Alignof(int); // FIXME: Uses C++ spelling for alignof in C mode. 
- // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' alignof 'int' + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' alignof 'int' } struct S { diff --git a/clang/test/AST/ast-dump-expr.cpp b/clang/test/AST/ast-dump-expr.cpp index 8ccb39f8f3165..6fd429d1500a4 100644 --- a/clang/test/AST/ast-dump-expr.cpp +++ b/clang/test/AST/ast-dump-expr.cpp @@ -115,34 +115,34 @@ void Casting(const S *s) { template void UnaryExpressions(int *p) { sizeof...(Ts); - // CHECK: SizeOfPackExpr 0x{{[^ ]*}} 'unsigned long' 0x{{[^ ]*}} Ts + // CHECK: SizeOfPackExpr 0x{{[^ ]*}} '__size_t':'unsigned long' 0x{{[^ ]*}} Ts noexcept(p - p); // CHECK: CXXNoexceptExpr 0x{{[^ ]*}} 'bool' - // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'long' '-' + // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} '__ptrdiff_t':'long' '-' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' ::new int; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' global Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' global Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' new (int); - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' new int{12}; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'int' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 12 new int[2]; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(__size_t)' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 2 new int[2]{1, 2}; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(__size_t)' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 2 // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'int[2]' @@ -164,7 +164,7 @@ void UnaryExpressions(int *p) { // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' ::delete p; - // CHECK: CXXDeleteExpr 0x{{[^ ]*}} 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, unsigned long) noexcept' + // CHECK: CXXDeleteExpr 0x{{[^ ]*}} 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, __size_t) noexcept' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' diff --git a/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c index 10f27e759b5b1..672607fa90670 100644 --- a/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c @@ -57,8 +57,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -97,8 +97,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -144,8 +144,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -191,8 +191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -251,8 +251,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c index 419ba57191039..8eedf8ac8bc58 100644 --- a/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c @@ -57,8 +57,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -97,8 +97,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -144,8 +144,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -191,8 +191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -251,8 +251,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c index c209a0456d7a0..64e19ce0a53bf 100644 --- a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c @@ -65,8 +65,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -94,8 +94,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -123,8 +123,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -152,8 +152,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -189,8 +189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -218,8 +218,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -247,8 +247,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -276,8 +276,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -325,8 +325,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -371,8 +371,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -417,8 +417,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -463,8 +463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -517,8 +517,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -563,8 +563,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -609,8 +609,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -655,8 +655,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -711,8 +711,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -757,8 +757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -803,8 +803,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -849,8 +849,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -903,8 +903,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -949,8 +949,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -995,8 +995,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1041,8 +1041,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1097,8 +1097,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1143,8 +1143,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1189,8 +1189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1289,8 +1289,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1335,8 +1335,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1381,8 +1381,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1427,8 +1427,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1497,8 +1497,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1560,8 +1560,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1623,8 +1623,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1686,8 +1686,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1757,8 +1757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1820,8 +1820,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1883,8 +1883,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long'
 // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict'
 // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit
 // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0
@@ -1946,8 +1946,8 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: | | `-NullStmt {{.*}}
 // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict'
 // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict'
-// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long'
-// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long'
+// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long'
+// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long'
 // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict'
 // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit
 // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0
diff --git a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c
index b13e096101e63..cf3f4bfcaf225 100644
--- a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c
+++ b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c
@@ -65,8 +65,8 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}}
 // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict'
 // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict'
-// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long'
-// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long'
+// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long'
+// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long'
 // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict'
 // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit
 // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0
@@ -94,8 +94,8 @@ void test_five(int x, int y, int z) {
 // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}}
 // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict'
 // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict'
-// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long'
-// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long'
+// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -123,8 +123,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -152,8 +152,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -189,8 +189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -218,8 +218,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -247,8 +247,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -276,8 +276,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -325,8 +325,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -371,8 +371,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -417,8 +417,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -463,8 +463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -517,8 +517,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -563,8 +563,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -609,8 +609,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -655,8 +655,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -711,8 +711,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -757,8 +757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -803,8 +803,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -849,8 +849,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -903,8 +903,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -949,8 +949,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -995,8 +995,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1041,8 +1041,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1097,8 +1097,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1143,8 +1143,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1189,8 +1189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1289,8 +1289,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1335,8 +1335,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1381,8 +1381,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1427,8 +1427,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1497,8 +1497,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1560,8 +1560,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1623,8 +1623,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1686,8 +1686,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1757,8 +1757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1820,8 +1820,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1883,8 +1883,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1946,8 +1946,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c index 14356882b599a..c8da8cd1a5efa 100644 --- a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c @@ -71,8 +71,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -99,8 +99,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -127,8 +127,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -155,8 +155,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -211,8 +211,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -239,8 +239,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -267,8 +267,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -295,8 +295,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -363,8 +363,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -407,8 +407,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -451,8 +451,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -495,8 +495,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -568,8 +568,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -612,8 +612,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -656,8 +656,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -700,8 +700,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -775,8 +775,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -819,8 +819,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -863,8 +863,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -907,8 +907,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -984,8 +984,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1028,8 +1028,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1072,8 +1072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1116,8 +1116,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1191,8 +1191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1279,8 +1279,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1323,8 +1323,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1419,8 +1419,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1463,8 +1463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1507,8 +1507,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1551,8 +1551,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1659,8 +1659,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1719,8 +1719,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1779,8 +1779,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1839,8 +1839,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1952,8 +1952,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2012,8 +2012,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2072,8 +2072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2132,8 +2132,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c index 0f983cfdff1dc..09b649cbb3660 100644 --- a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c @@ -71,8 +71,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -99,8 +99,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -127,8 +127,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -155,8 +155,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -211,8 +211,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -239,8 +239,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -267,8 +267,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -295,8 +295,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -363,8 +363,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -407,8 +407,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -451,8 +451,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -495,8 +495,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -568,8 +568,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -612,8 +612,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -656,8 +656,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -700,8 +700,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -775,8 +775,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -819,8 +819,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -863,8 +863,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -907,8 +907,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -984,8 +984,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1028,8 +1028,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1072,8 +1072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1116,8 +1116,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1191,8 +1191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1279,8 +1279,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1323,8 +1323,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1419,8 +1419,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1463,8 +1463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1507,8 +1507,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1551,8 +1551,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1659,8 +1659,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1719,8 +1719,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1779,8 +1779,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1839,8 +1839,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1952,8 +1952,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2012,8 +2012,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2072,8 +2072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2132,8 +2132,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-stmt-json.cpp b/clang/test/AST/ast-dump-stmt-json.cpp index a473d17da9424..a8f113ce6a3d4 100644 --- a/clang/test/AST/ast-dump-stmt-json.cpp +++ b/clang/test/AST/ast-dump-stmt-json.cpp @@ -963,7 +963,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -994,7 +994,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1126,7 +1126,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1146,7 +1146,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -1337,7 +1338,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -1369,7 +1370,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// 
CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1444,7 +1445,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "mangledName": "_Znwm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1457,7 +1458,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1503,7 +1505,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "mangledName": "_ZnwmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long, std::align_val_t)" +// CHECK-NEXT: "qualType": "void *(__size_t, std::align_val_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1516,7 +1518,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1585,7 +1588,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "mangledName": "_Znam", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1598,7 +1601,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1644,7 +1648,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "mangledName": "_ZnamSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long, std::align_val_t)" +// CHECK-NEXT: "qualType": "void *(__size_t, std::align_val_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1657,7 +1661,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1821,7 +1826,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "mangledName": "_ZdlPvm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1847,7 +1852,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // 
CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1874,7 +1880,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "mangledName": "_ZdlPvmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t, std::align_val_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1900,7 +1906,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -2036,7 +2043,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete[]", // CHECK-NEXT: "mangledName": "_ZdaPvm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -2062,7 +2069,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -2089,7 +2097,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete[]", // CHECK-NEXT: "mangledName": "_ZdaPvmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t, std::align_val_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -2115,7 +2123,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -3881,7 +3890,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3955,7 +3965,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -4085,7 +4096,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -4159,7 +4171,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: 
"desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -4980,7 +4993,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "long" +// CHECK-NEXT: "desugaredQualType": "long", +// CHECK-NEXT: "qualType": "__ptrdiff_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "value": "10" @@ -6503,7 +6517,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "long" +// CHECK-NEXT: "desugaredQualType": "long" +// CHECK-NEXT: "qualType": "__ptrdiff_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "value": "10" diff --git a/clang/test/AST/ast-dump-stmt.cpp b/clang/test/AST/ast-dump-stmt.cpp index 407584e5b82de..42c5f3b3498a4 100644 --- a/clang/test/AST/ast-dump-stmt.cpp +++ b/clang/test/AST/ast-dump-stmt.cpp @@ -206,7 +206,7 @@ void TestIteration() { // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' - // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 + // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} '__ptrdiff_t':'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' @@ -274,7 +274,7 @@ void TestIteration() { // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' - // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 + // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} '__ptrdiff_t':'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' diff --git a/clang/test/AST/ast-dump-traits.cpp b/clang/test/AST/ast-dump-traits.cpp index 3085e5883fd2e..72d2a2ae8603e 100644 --- a/clang/test/AST/ast-dump-traits.cpp +++ b/clang/test/AST/ast-dump-traits.cpp @@ -56,7 +56,7 @@ void test_unary_expr_or_type_trait() { // CHECK-NEXT: |-FunctionDecl {{.*}} line:20:6{{( imported)?}} test_array_type_trait 'void ()' // CHECK-NEXT: | `-CompoundStmt {{.*}} // CHECK-NEXT: | `-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-ArrayTypeTraitExpr {{.*}} 'unsigned long' __array_rank +// CHECK-NEXT: | `-ArrayTypeTraitExpr {{.*}} '__size_t':'unsigned long' __array_rank // CHECK-NEXT: |-FunctionDecl {{.*}} line:25:6{{( imported)?}} test_expression_trait 'void ()' // CHECK-NEXT: | `-CompoundStmt {{.*}} // CHECK-NEXT: | `-CStyleCastExpr {{.*}} 'void' @@ -64,8 +64,8 @@ void test_unary_expr_or_type_trait() { // CHECK-NEXT: `-FunctionDecl {{.*}} line:30:6{{( imported)?}} test_unary_expr_or_type_trait 'void ()' // CHECK-NEXT: `-CompoundStmt {{.*}} // CHECK-NEXT: |-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'int' +// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' sizeof 'int' // CHECK-NEXT: |-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' alignof 'int' +// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' alignof 'int' // CHECK-NEXT: 
`-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' __alignof 'int' +// CHECK-NEXT: `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' __alignof 'int' diff --git a/clang/test/AST/ast-dump-types-errors-json.cpp b/clang/test/AST/ast-dump-types-errors-json.cpp index e15f8eeee20cc..d9f918f6c3d72 100644 --- a/clang/test/AST/ast-dump-types-errors-json.cpp +++ b/clang/test/AST/ast-dump-types-errors-json.cpp @@ -60,7 +60,8 @@ using TestContainsErrors = int[sizeof(undef())]; // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", diff --git a/clang/test/Analysis/cfg.cpp b/clang/test/Analysis/cfg.cpp index 44a89df28e3b2..d6cef88dc18a6 100644 --- a/clang/test/Analysis/cfg.cpp +++ b/clang/test/Analysis/cfg.cpp @@ -70,7 +70,7 @@ void F(EmptyE e) { // CHECK-NEXT: Succs (1): B1 // CHECK: [B1] // CHECK-NEXT: 1: __builtin_object_size -// CHECK-NEXT: 2: [B1.1] (ImplicitCastExpr, BuiltinFnToFnPtr, unsigned long (*)(const void *, int) noexcept) +// CHECK-NEXT: 2: [B1.1] (ImplicitCastExpr, BuiltinFnToFnPtr, __size_t (*)(const void *, int) noexcept) // CHECK-NEXT: 3: [B1.2](dummy(), 0) // CHECK-NEXT: 4: (void)[B1.3] (CStyleCastExpr, ToVoid, void) // CHECK-NEXT: Preds (1): B2 diff --git a/clang/test/Analysis/div-zero-cxx20.cpp b/clang/test/Analysis/div-zero-cxx20.cpp new file mode 100644 index 0000000000000..00ea96e796777 --- /dev/null +++ b/clang/test/Analysis/div-zero-cxx20.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core.DivideZero -std=c++20 -verify %s + +namespace GH148875 { +struct A { + int x; + A(int v) : x(v) {} +}; + +struct B { + int x; + B() : x(0) {} +}; + +struct C { + int x, y; + C(int a, int b) : x(a), y(b) {} +}; + +struct D { + int x; +}; + +struct E { + D d; + E(int a) : d(a) {} +}; + +struct F { + int x; +}; + +int t1() { + A a{42}; + return 1 / (a.x - 42); // expected-warning {{Division by zero}} +} + +int t2() { + B b{}; + return 1 / b.x; // expected-warning {{Division by zero}} +} + +int t3() { + C c1{1, -1}; + return 1 / (c1.x + c1.y); // expected-warning {{Division by zero}} +} + +int t4() { + C c2{0, 0}; + return 1 / (c2.x + c2.y); // expected-warning {{Division by zero}} +} + +int t5() { + E e{32}; + return 1 / (e.d.x - 32); // expected-warning {{Division by zero}} +} + +int t6() { + F f(32); + return 1 / (f.x - 32); // expected-warning {{Division by zero}} +} +} // namespace GH148875 diff --git a/clang/test/Analysis/div-zero.cpp b/clang/test/Analysis/div-zero.cpp index 063450d8883b0..51ea25e828a18 100644 --- a/clang/test/Analysis/div-zero.cpp +++ b/clang/test/Analysis/div-zero.cpp @@ -11,3 +11,63 @@ int fooPR10616 (int qX ) { return (a % (qX-1)); // expected-warning {{Division by zero}} } + +namespace GH148875 { +struct A { + int x; + A(int v) : x(v) {} +}; + +struct B { + int x; + B() : x(0) {} +}; + +struct C { + int x, y; + C(int a, int b) : x(a), y(b) {} +}; + +struct D { + int x; +}; + +struct E { + D d; + E(int a) : d{a} {} +}; + +struct F { + int x; +}; + +int t1() { + A a{42}; + return 1 / (a.x - 42); // expected-warning {{Division by zero}} +} + +int t2() { + B b{}; + return 1 / b.x; // expected-warning {{Division by zero}} +} + +int t3() { + C c1{1, -1}; + return 1 / (c1.x + c1.y); // expected-warning {{Division by zero}} +} + +int t4() { + C c2{0, 0}; + return 1 / 
(c2.x + c2.y); // expected-warning {{Division by zero}} +} + +int t5() { + E e{32}; + return 1 / (e.d.x - 32); // expected-warning {{Division by zero}} +} + +int t6() { + F f{32}; + return 1 / (f.x - 32); // expected-warning {{Division by zero}} +} +} diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp index 267980c3b20c8..dfc650223c9e7 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -46,7 +46,7 @@ void test_1(int param, void *ptr) { void test_2(char *ptr, int ext) { clang_analyzer_explain((void *) "asdf"); // expected-warning-re{{{{^pointer to element of type 'char' with index 0 of string literal "asdf"$}}}} - clang_analyzer_explain(strlen(ptr)); // expected-warning-re{{{{^metadata of type 'unsigned long' tied to pointee of argument 'ptr'$}}}} + clang_analyzer_explain(strlen(ptr)); // expected-warning-re{{{{^metadata of type '__size_t' tied to pointee of argument 'ptr'$}}}} clang_analyzer_explain(conjure()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure\(\)'$}}}} clang_analyzer_explain(glob); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure\(\)'\) for global variable 'glob'$}}}} clang_analyzer_explain(glob_ptr); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure\(\)'\) for global variable 'glob_ptr'$}}}} diff --git a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c index 1f0d3627fae34..ba5bc57928b0c 100644 --- a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c +++ b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c @@ -20,7 +20,7 @@ // RUN: -triple x86_64-unknown-linux 2>&1 | FileCheck %s // CHECK: Loaded summary for: int isalnum(int) -// CHECK: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict) __attribute__((nonnull(1))) +// CHECK: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) __attribute__((nonnull(1))) // CHECK: Loaded summary for: int fileno(FILE *stream) void initializeSummaryMap(void); diff --git a/clang/test/Analysis/std-c-library-functions-lookup.c b/clang/test/Analysis/std-c-library-functions-lookup.c index e47d9bddda91b..8182e5a1f5fde 100644 --- a/clang/test/Analysis/std-c-library-functions-lookup.c +++ b/clang/test/Analysis/std-c-library-functions-lookup.c @@ -6,7 +6,7 @@ // RUN: -analyzer-config eagerly-assume=false \ // RUN: -triple i686-unknown-linux 2>&1 | FileCheck %s -// CHECK: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) typedef typeof(sizeof(int)) size_t; typedef struct FILE FILE; diff --git a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c index b99cc30149c91..887817ba8551e 100644 --- a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c +++ b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c @@ -31,8 +31,8 @@ // Verify that the summaries are loaded when the StdLibraryFunctionsChecker is // enabled. 
// CHECK: Loaded summary for: int getchar(void) -// CHECK-NEXT: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict) -// CHECK-NEXT: Loaded summary for: unsigned long fwrite(const void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict) #include "Inputs/system-header-simulator.h" diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c index b03a1a5656517..b5f663493a676 100644 --- a/clang/test/Analysis/std-c-library-functions.c +++ b/clang/test/Analysis/std-c-library-functions.c @@ -59,8 +59,8 @@ // CHECK-NEXT: Loaded summary for: int tolower(int) // CHECK-NEXT: Loaded summary for: int toascii(int) // CHECK-NEXT: Loaded summary for: int getchar(void) -// CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) -// CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict) // CHECK-NEXT: Loaded summary for: ssize_t read(int, void *, size_t) // CHECK-NEXT: Loaded summary for: ssize_t write(int, const void *, size_t) // CHECK-NEXT: Loaded summary for: ssize_t getline(char **restrict, size_t *restrict, FILE *restrict) diff --git a/clang/test/C/C2y/n3353.c b/clang/test/C/C2y/n3353.c index cd61cbf039067..a2e08cf6344db 100644 --- a/clang/test/C/C2y/n3353.c +++ b/clang/test/C/C2y/n3353.c @@ -44,7 +44,12 @@ static const void *ptr = 0o0; /* ext-warning {{octal integer literals are a C2y #endif // 0 by itself is not deprecated, of course. -int k = 0; +int k1 = 0; +unsigned int k2 = 0u; +long k3 = 0l; +unsigned long k4 = 0ul; +long long k5 = 0ll; +unsigned long long k6 = 0ull; // Test a preprocessor use of 0 by itself, which is also not deprecated. #if 0 @@ -65,7 +70,6 @@ static_assert(__extension__ _Generic(typeof(l), const int : 1, default : 0)); // // Note that 0o by itself is an invalid literal. int m = 0o; /* expected-error {{invalid suffix 'o' on integer constant}} - c2y-warning {{octal literals without a '0o' prefix are deprecated}} */ // Ensure negation works as expected. @@ -83,13 +87,11 @@ int n = 0o18; /* expected-error {{invalid digit '8' in octal constant}} cpp-warning {{octal integer literals are a Clang extension}} */ int o1 = 0o8; /* expected-error {{invalid suffix 'o8' on integer constant}} - c2y-warning {{octal literals without a '0o' prefix are deprecated}} */ // FIXME: however, it matches the behavior for hex literals in terms of the // error reported. Unfortunately, we then go on to think 0 is an octal literal // without a prefix, which is again a bit confusing. int o2 = 0xG; /* expected-error {{invalid suffix 'xG' on integer constant}} - c2y-warning {{octal literals without a '0o' prefix are deprecated}} */ // Show that floating-point suffixes on octal literals are rejected. 
@@ -130,7 +132,6 @@ constexpr int p = 0o0'1'2'3'4'5'6'7; /* compat-warning {{octal integer literals */ static_assert(p == 01234567); // c2y-warning {{octal literals without a '0o' prefix are deprecated}} int q = 0o'0'1; /* expected-error {{invalid suffix 'o'0'1' on integer constant}} - c2y-warning {{octal literals without a '0o' prefix are deprecated}} */ #define M 0o123 diff --git a/clang/test/CIR/CodeGen/bitfields.c b/clang/test/CIR/CodeGen/bitfields.c index 896acbfc854a4..a73c076ea81ab 100644 --- a/clang/test/CIR/CodeGen/bitfields.c +++ b/clang/test/CIR/CodeGen/bitfields.c @@ -87,14 +87,14 @@ int load_field(S* s) { // CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] // CIR: [[TMP1:%.*]] = cir.load{{.*}} [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "c"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i +// CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i // LLVM: define dso_local i32 @load_field // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = alloca i32, i64 1, align 4 // LLVM: [[TMP2:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP3:%.*]] = getelementptr %struct.S, ptr [[TMP2]], i32 0, i32 0 -// LLVM: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8 +// LLVM: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 4 // LLVM: [[TMP5:%.*]] = shl i64 [[TMP4]], 15 // LLVM: [[TMP6:%.*]] = ashr i64 [[TMP5]], 47 // LLVM: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 @@ -115,13 +115,13 @@ unsigned int load_field_unsigned(A* s) { //CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] {alignment = 8 : i64} //CIR: [[TMP1:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr //CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][3] {name = "more_bits"} : !cir.ptr -> !cir.ptr -//CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_more_bits, [[TMP2]] : !cir.ptr) -> !u32i +//CIR: [[TMP3:%.*]] = cir.get_bitfield align(1) (#bfi_more_bits, [[TMP2]] : !cir.ptr) -> !u32i //LLVM: define dso_local i32 @load_field_unsigned //LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 //LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 //LLVM: [[TMP2:%.*]] = getelementptr %struct.A, ptr [[TMP1]], i32 0, i32 3 -//LLVM: [[TMP3:%.*]] = load i16, ptr [[TMP2]], align 2 +//LLVM: [[TMP3:%.*]] = load i16, ptr [[TMP2]], align 1 //LLVM: [[TMP4:%.*]] = lshr i16 [[TMP3]], 3 //LLVM: [[TMP5:%.*]] = and i16 [[TMP4]], 15 //LLVM: [[TMP6:%.*]] = zext i16 [[TMP5]] to i32 @@ -143,15 +143,15 @@ void store_field() { // CIR: [[TMP0:%.*]] = cir.alloca !rec_S, !cir.ptr // CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i // CIR: [[TMP2:%.*]] = cir.get_member [[TMP0]][1] {name = "e"} : !cir.ptr -> !cir.ptr -// CIR: cir.set_bitfield(#bfi_e, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) +// CIR: cir.set_bitfield align(4) (#bfi_e, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) // LLVM: define dso_local void @store_field() // LLVM: [[TMP0:%.*]] = alloca %struct.S, i64 1, align 4 // LLVM: [[TMP1:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 1 -// LLVM: [[TMP2:%.*]] = load i16, ptr [[TMP1]], align 2 +// LLVM: [[TMP2:%.*]] = load i16, ptr [[TMP1]], align 4 // LLVM: [[TMP3:%.*]] = and i16 [[TMP2]], -32768 // LLVM: [[TMP4:%.*]] = or i16 [[TMP3]], 3 -// LLVM: store i16 [[TMP4]], ptr [[TMP1]], align 2 +// LLVM: store i16 [[TMP4]], ptr [[TMP1]], align 4 // OGCG: define dso_local void @store_field() // OGCG: [[TMP0:%.*]] = alloca %struct.S, align 4 @@ -169,24 +169,24 @@ void store_bitfield_to_bitfield() { 
// CIR: cir.func {{.*@store_bitfield_to_bitfield}} // CIR: [[TMP0:%.*]] = cir.alloca !rec_S, !cir.ptr, ["s"] {alignment = 4 : i64} // CIR: [[TMP1:%.*]] = cir.get_member [[TMP0]][0] {name = "c"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP2:%.*]] = cir.get_bitfield(#bfi_c, [[TMP1]] : !cir.ptr) -> !s32i +// CIR: [[TMP2:%.*]] = cir.get_bitfield align(4) (#bfi_c, [[TMP1]] : !cir.ptr) -> !s32i // CIR: [[TMP3:%.*]] = cir.get_member [[TMP0]][0] {name = "a"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_a, [[TMP3]] : !cir.ptr, [[TMP2]] : !s32i) -> !s32i +// CIR: [[TMP4:%.*]] = cir.set_bitfield align(4) (#bfi_a, [[TMP3]] : !cir.ptr, [[TMP2]] : !s32i) -> !s32i // LLVM: define dso_local void @store_bitfield_to_bitfield() // LLVM: [[TMP0:%.*]] = alloca %struct.S, i64 1, align 4 // LLVM: [[TMP1:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 0 -// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 4 // LLVM: [[TMP3:%.*]] = shl i64 [[TMP2]], 15 // LLVM: [[TMP4:%.*]] = ashr i64 [[TMP3]], 47 // LLVM: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 // LLVM: [[TMP6:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 0 // LLVM: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 -// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 +// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 4 // LLVM: [[TMP9:%.*]] = and i64 [[TMP7]], 15 // LLVM: [[TMP10:%.*]] = and i64 [[TMP8]], -16 // LLVM: [[TMP11:%.*]] = or i64 [[TMP10]], [[TMP9]] -// LLVM: store i64 [[TMP11]], ptr [[TMP6]], align 8 +// LLVM: store i64 [[TMP11]], ptr [[TMP6]], align 4 // LLVM: [[TMP12:%.*]] = shl i64 [[TMP9]], 60 // LLVM: [[TMP13:%.*]] = ashr i64 [[TMP12]], 60 // LLVM: [[TMP15:%.*]] = trunc i64 [[TMP13]] to i32 @@ -222,16 +222,16 @@ void get_volatile(V* v) { // CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i // CIR: [[TMP2:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP3:%.*]] = cir.get_member [[TMP2]][0] {name = "b"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i +// CIR: [[TMP4:%.*]] = cir.set_bitfield align(4) (#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i // LLVM: define dso_local void @get_volatile // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP2:%.*]] = getelementptr %struct.V, ptr [[TMP1]], i32 0, i32 0 -// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 8 +// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 4 // LLVM: [[TMP4:%.*]] = and i64 [[TMP3]], -1095216660481 // LLVM: [[TMP5:%.*]] = or i64 [[TMP4]], 12884901888 -// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 8 +// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 4 // OCGC: define dso_local void @get_volatile // OCGC: [[TMP0:%.*]] = alloca ptr, align 8 @@ -249,16 +249,16 @@ void set_volatile(V* v) { //CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i //CIR: [[TMP2:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr //CIR: [[TMP3:%.*]] = cir.get_member [[TMP2]][0] {name = "b"} : !cir.ptr -> !cir.ptr -//CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i +//CIR: [[TMP4:%.*]] = cir.set_bitfield align(4) (#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i // LLVM: define dso_local void @set_volatile // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = load ptr, ptr 
[[TMP0]], align 8 // LLVM: [[TMP2:%.*]] = getelementptr %struct.V, ptr [[TMP1]], i32 0, i32 0 -// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 8 +// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 4 // LLVM: [[TMP4:%.*]] = and i64 [[TMP3]], -1095216660481 // LLVM: [[TMP5:%.*]] = or i64 [[TMP4]], 12884901888 -// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 8 +// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 4 // OGCG: define dso_local void @set_volatile // OGCG: [[TMP0:%.*]] = alloca ptr, align 8 @@ -276,24 +276,24 @@ void unOp(S* s) { // CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] {alignment = 8 : i64} // CIR: [[TMP1:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "d"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_d, [[TMP2]] : !cir.ptr) -> !s32i +// CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_d, [[TMP2]] : !cir.ptr) -> !s32i // CIR: [[TMP4:%.*]] = cir.unary(inc, [[TMP3]]) nsw : !s32i, !s32i -// CIR: cir.set_bitfield(#bfi_d, [[TMP2]] : !cir.ptr, [[TMP4]] : !s32i) +// CIR: cir.set_bitfield align(4) (#bfi_d, [[TMP2]] : !cir.ptr, [[TMP4]] : !s32i) // LLVM: define {{.*@unOp}} // LLVM: [[TMP0:%.*]] = getelementptr %struct.S, ptr [[LOAD0:%.*]], i32 0, i32 0 -// LLVM: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 8 +// LLVM: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 4 // LLVM: [[TMP2:%.*]] = shl i64 [[TMP1]], 13 // LLVM: [[TMP3:%.*]] = ashr i64 [[TMP2]], 62 // LLVM: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 // LLVM: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 // LLVM: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 -// LLVM: [[TMP7:%.*]] = load i64, ptr [[TMP0]], align 8 +// LLVM: [[TMP7:%.*]] = load i64, ptr [[TMP0]], align 4 // LLVM: [[TMP8:%.*]] = and i64 [[TMP6]], 3 // LLVM: [[TMP9:%.*]] = shl i64 [[TMP8]], 49 // LLVM: [[TMP10:%.*]] = and i64 [[TMP7]], -1688849860263937 // LLVM: [[TMP11:%.*]] = or i64 [[TMP10]], [[TMP9]] -// LLVM: store i64 [[TMP11]], ptr [[TMP0]], align 8 +// LLVM: store i64 [[TMP11]], ptr [[TMP0]], align 4 // LLVM: [[TMP12:%.*]] = shl i64 [[TMP8]], 62 // LLVM: [[TMP13:%.*]] = ashr i64 [[TMP12]], 62 // LLVM: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 diff --git a/clang/test/CIR/CodeGen/bitfields.cpp b/clang/test/CIR/CodeGen/bitfields.cpp index 6715ebf1f48b6..7650e0b83faf6 100644 --- a/clang/test/CIR/CodeGen/bitfields.cpp +++ b/clang/test/CIR/CodeGen/bitfields.cpp @@ -39,14 +39,14 @@ int load_field(S* s) { // CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] // CIR: [[TMP1:%.*]] = cir.load{{.*}} [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "c"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i +// CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i // LLVM: define dso_local i32 @_Z10load_fieldP1S // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = alloca i32, i64 1, align 4 // LLVM: [[TMP2:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP3:%.*]] = getelementptr %struct.S, ptr [[TMP2]], i32 0, i32 0 -// LLVM: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8 +// LLVM: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 4 // LLVM: [[TMP5:%.*]] = shl i64 [[TMP4]], 15 // LLVM: [[TMP6:%.*]] = ashr i64 [[TMP5]], 47 // LLVM: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 @@ -67,15 +67,15 @@ void store_field() { // CIR: [[TMP0:%.*]] = cir.alloca !rec_S, !cir.ptr // CIR: [[TMP1:%.*]] 
= cir.const #cir.int<3> : !s32i // CIR: [[TMP2:%.*]] = cir.get_member [[TMP0]][0] {name = "a"} : !cir.ptr -> !cir.ptr -// CIR: cir.set_bitfield(#bfi_a, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) +// CIR: cir.set_bitfield align(4) (#bfi_a, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) // LLVM: define dso_local void @_Z11store_fieldv // LLVM: [[TMP0:%.*]] = alloca %struct.S, i64 1, align 4 // LLVM: [[TMP1:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 0 -// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 4 // LLVM: [[TMP3:%.*]] = and i64 [[TMP2]], -16 // LLVM: [[TMP4:%.*]] = or i64 [[TMP3]], 3 -// LLVM: store i64 [[TMP4]], ptr [[TMP1]], align 8 +// LLVM: store i64 [[TMP4]], ptr [[TMP1]], align 4 // OGCG: define dso_local void @_Z11store_fieldv() // OGCG: [[TMP0:%.*]] = alloca %struct.S, align 4 @@ -93,25 +93,25 @@ void store_bitfield_to_bitfield(S* s) { // CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i // CIR: [[TMP2:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP3:%.*]] = cir.get_member [[TMP2]][0] {name = "b"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) -> !s32i +// CIR: [[TMP4:%.*]] = cir.set_bitfield align(4) (#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) -> !s32i // CIR: [[TMP5:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP6:%.*]] = cir.get_member [[TMP5]][0] {name = "a"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP7:%.*]] = cir.set_bitfield(#bfi_a, [[TMP6]] : !cir.ptr, [[TMP4]] : !s32i) -> !s32i +// CIR: [[TMP7:%.*]] = cir.set_bitfield align(4) (#bfi_a, [[TMP6]] : !cir.ptr, [[TMP4]] : !s32i) -> !s32i // LLVM: define dso_local void @_Z26store_bitfield_to_bitfieldP1S // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP2:%.*]] = getelementptr %struct.S, ptr [[TMP1]], i32 0, i32 0 -// LLVM: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 +// LLVM: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 // LLVM: [[TMP4:%.*]] = and i64 [[TMP3]], -2147483633 // LLVM: [[TMP5:%.*]] = or i64 [[TMP4]], 48 -// LLVM: store i64 [[TMP5]], ptr [[TMP2]], align 8 +// LLVM: store i64 [[TMP5]], ptr [[TMP2]], align 4 // LLVM: [[TMP6:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP7:%.*]] = getelementptr %struct.S, ptr [[TMP6]], i32 0, i32 0 -// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 8 +// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 4 // LLVM: [[TMP9:%.*]] = and i64 [[TMP8]], -16 // LLVM: [[TMP10:%.*]] = or i64 [[TMP9]], 3 -// LLVM: store i64 [[TMP10]], ptr [[TMP7]], align 8 +// LLVM: store i64 [[TMP10]], ptr [[TMP7]], align 4 // OGCG: define dso_local void @_Z26store_bitfield_to_bitfieldP1S // OGCG: [[TMP0:%.*]] = alloca ptr, align 8 diff --git a/clang/test/CIR/CodeGen/bitfields_be.c b/clang/test/CIR/CodeGen/bitfields_be.c index 6133927b67d21..77741ba74870b 100644 --- a/clang/test/CIR/CodeGen/bitfields_be.c +++ b/clang/test/CIR/CodeGen/bitfields_be.c @@ -25,7 +25,7 @@ int init(S* s) { //CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] {alignment = 8 : i64} //CIR: [[TMP1:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr //CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "c"} : !cir.ptr -> !cir.ptr -//CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i +//CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i //LLVM: define dso_local i32 @init(ptr %0) { //LLVM: [[TMP0:%.*]] = 
alloca ptr, i64 1, align 8 @@ -57,7 +57,7 @@ void load(S* s) { // CIR: %[[MIN1:.*]] = cir.unary(minus, %[[CONST1]]) nsw : !s32i, !s32i // CIR: %[[VAL0:.*]] = cir.load align(8) %[[PTR0]] : !cir.ptr>, !cir.ptr // CIR: %[[GET0:.*]] = cir.get_member %[[VAL0]][0] {name = "a"} : !cir.ptr -> !cir.ptr -// CIR: %[[SET0:.*]] = cir.set_bitfield(#bfi_a, %[[GET0]] : !cir.ptr, %[[MIN1]] : !s32i) -> !s32i +// CIR: %[[SET0:.*]] = cir.set_bitfield align(4) (#bfi_a, %[[GET0]] : !cir.ptr, %[[MIN1]] : !s32i) -> !s32i // LLVM: define dso_local void @load // LLVM: %[[PTR0:.*]] = load ptr @@ -65,50 +65,50 @@ void load(S* s) { // LLVM: %[[VAL0:.*]] = load i32, ptr %[[GET0]], align 4 // LLVM: %[[AND0:.*]] = and i32 %[[VAL0]], 268435455 // LLVM: %[[OR0:.*]] = or i32 %[[AND0]], -1073741824 -// LLVM: store i32 %[[OR0]], ptr %[[GET0]] +// LLVM: store i32 %[[OR0]], ptr %[[GET0]], align 4 // OGCG: define dso_local void @load // OGCG: %[[PTR0:.*]] = load ptr -// OGCG: %[[VAL0:.*]] = load i32, ptr %[[PTR0]] +// OGCG: %[[VAL0:.*]] = load i32, ptr %[[PTR0]], align 4 // OGCG: %[[AND0:.*]] = and i32 %[[VAL0]], 268435455 // OGCG: %[[OR0:.*]] = or i32 %[[AND0]], -1073741824 -// OGCG: store i32 %[[OR0]], ptr %[[PTR0]] +// OGCG: store i32 %[[OR0]], ptr %[[PTR0]], align 4 // field 'b' // CIR: %[[CONST2:.*]] = cir.const #cir.int<42> : !s32i // CIR: %[[VAL1:.*]] = cir.load align(8) %[[PTR0]] : !cir.ptr>, !cir.ptr // CIR: %[[GET1:.*]] = cir.get_member %[[VAL1]][0] {name = "b"} : !cir.ptr -> !cir.ptr -// CIR: %[[SET1:.*]] = cir.set_bitfield(#bfi_b, %[[GET1]] : !cir.ptr, %[[CONST2]] : !s32i) -> !s32i +// CIR: %[[SET1:.*]] = cir.set_bitfield align(4) (#bfi_b, %[[GET1]] : !cir.ptr, %[[CONST2]] : !s32i) -> !s32i // LLVM: %[[PTR1:.*]] = load ptr // LLVM: %[[GET1:.*]] = getelementptr %struct.S, ptr %[[PTR1]], i32 0, i32 0 // LLVM: %[[VAL1:.*]] = load i32, ptr %[[GET1]], align 4 // LLVM: %[[AND1:.*]] = and i32 %[[VAL1]], -268304385 // LLVM: %[[OR1:.*]] = or i32 %[[AND1]], 5505024 -// LLVM: store i32 %[[OR1]], ptr %[[GET1]] +// LLVM: store i32 %[[OR1]], ptr %[[GET1]], align 4 // OGCG: %[[PTR1:.*]] = load ptr -// OGCG: %[[VAL1:.*]] = load i32, ptr %[[PTR1]] +// OGCG: %[[VAL1:.*]] = load i32, ptr %[[PTR1]], align 4 // OGCG: %[[AND1:.*]] = and i32 %[[VAL1]], -268304385 // OGCG: %[[OR1:.*]] = or i32 %[[AND1]], 5505024 -// OGCG: store i32 %[[OR1]], ptr %[[PTR1]] +// OGCG: store i32 %[[OR1]], ptr %[[PTR1]], align 4 // field 'c' // CIR: %[[CONST3:.*]] = cir.const #cir.int<12345> : !s32i // CIR: %[[MIN2:.*]] = cir.unary(minus, %[[CONST3]]) nsw : !s32i, !s32i // CIR: %[[VAL2:.*]] = cir.load align(8) %[[PTR0]] : !cir.ptr>, !cir.ptr // CIR: %[[GET2:.*]] = cir.get_member %[[VAL2]][0] {name = "c"} : !cir.ptr -> !cir.ptr -// CIR: %[[SET2:.*]] = cir.set_bitfield(#bfi_c, %[[GET2]] : !cir.ptr, %[[MIN2]] : !s32i) -> !s32i +// CIR: %[[SET2:.*]] = cir.set_bitfield align(4) (#bfi_c, %[[GET2]] : !cir.ptr, %[[MIN2]] : !s32i) -> !s32i // LLVM: %[[PTR2:.*]] = load ptr // LLVM: %[[GET2:.*]] = getelementptr %struct.S, ptr %[[PTR2]], i32 0, i32 0 // LLVM: %[[VAL2:.*]] = load i32, ptr %[[GET2]], align 4 // LLVM: %[[AND2:.*]] = and i32 %[[VAL2]], -131072 // LLVM: %[[OR2:.*]] = or i32 %[[AND2]], 118727 -// LLVM: store i32 %[[OR2]], ptr %[[GET2]] +// LLVM: store i32 %[[OR2]], ptr %[[GET2]], align 4 // OGCG: %[[PTR2:.*]] = load ptr -// OGCG: %[[VAL2:.*]] = load i32, ptr %[[PTR2]] +// OGCG: %[[VAL2:.*]] = load i32, ptr %[[PTR2]], align 4 // OGCG: %[[AND2:.*]] = and i32 %[[VAL2]], -131072 // OGCG: %[[OR2:.*]] = or i32 %[[AND2]], 118727 -// OGCG: store i32 %[[OR2]], ptr 
%[[PTR2]] +// OGCG: store i32 %[[OR2]], ptr %[[PTR2]], align 4 diff --git a/clang/test/CIR/CodeGen/builtin_bit.cpp b/clang/test/CIR/CodeGen/builtin_bit.cpp index f017b6eb51971..4ac82bd749e8a 100644 --- a/clang/test/CIR/CodeGen/builtin_bit.cpp +++ b/clang/test/CIR/CodeGen/builtin_bit.cpp @@ -416,3 +416,141 @@ unsigned long long test_builtin_bswap64(unsigned long long x) { // OGCG-LABEL: @_Z20test_builtin_bswap64y // OGCG: %{{.+}} = call i64 @llvm.bswap.i64(i64 %{{.+}}) + +unsigned char test_builtin_rotateleft8(unsigned char x, unsigned char y) { + return __builtin_rotateleft8(x, y); +} + +// CIR-LABEL: @_Z24test_builtin_rotateleft8hh +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u8i + +// LLVM-LABEL: @_Z24test_builtin_rotateleft8hh +// LLVM: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %{{.+}} = call i8 @llvm.fshl.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z24test_builtin_rotateleft8hh +// OGCG: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %{{.+}} = call i8 @llvm.fshl.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +unsigned short test_builtin_rotateleft16(unsigned short x, unsigned short y) { + return __builtin_rotateleft16(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateleft16tt +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u16i + +// LLVM-LABEL: @_Z25test_builtin_rotateleft16tt +// LLVM: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %{{.+}} = call i16 @llvm.fshl.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateleft16tt +// OGCG: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %{{.+}} = call i16 @llvm.fshl.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +unsigned test_builtin_rotateleft32(unsigned x, unsigned y) { + return __builtin_rotateleft32(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateleft32jj +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u32i + +// LLVM-LABEL: @_Z25test_builtin_rotateleft32jj +// LLVM: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %{{.+}} = call i32 @llvm.fshl.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateleft32jj +// OGCG: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %{{.+}} = call i32 @llvm.fshl.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +unsigned long long test_builtin_rotateleft64(unsigned long long x, + unsigned long long y) { + return __builtin_rotateleft64(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateleft64yy +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u64i + +// LLVM-LABEL: @_Z25test_builtin_rotateleft64yy +// LLVM: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %{{.+}} = call i64 @llvm.fshl.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateleft64yy +// OGCG: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %{{.+}} = call i64 @llvm.fshl.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) + +unsigned char 
test_builtin_rotateright8(unsigned char x, unsigned char y) { + return __builtin_rotateright8(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateright8hh +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u8i + +// LLVM-LABEL: @_Z25test_builtin_rotateright8hh +// LLVM: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %{{.+}} = call i8 @llvm.fshr.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateright8hh +// OGCG: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %{{.+}} = call i8 @llvm.fshr.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +unsigned short test_builtin_rotateright16(unsigned short x, unsigned short y) { + return __builtin_rotateright16(x, y); +} + +// CIR-LABEL: @_Z26test_builtin_rotateright16tt +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u16i + +// LLVM-LABEL: @_Z26test_builtin_rotateright16tt +// LLVM: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %{{.+}} = call i16 @llvm.fshr.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z26test_builtin_rotateright16tt +// OGCG: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %{{.+}} = call i16 @llvm.fshr.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +unsigned test_builtin_rotateright32(unsigned x, unsigned y) { + return __builtin_rotateright32(x, y); +} + +// CIR-LABEL: @_Z26test_builtin_rotateright32jj +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u32i + +// LLVM-LABEL: @_Z26test_builtin_rotateright32jj +// LLVM: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %{{.+}} = call i32 @llvm.fshr.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z26test_builtin_rotateright32jj +// OGCG: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %{{.+}} = call i32 @llvm.fshr.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +unsigned long long test_builtin_rotateright64(unsigned long long x, + unsigned long long y) { + return __builtin_rotateright64(x, y); +} + +// CIR-LABEL: @_Z26test_builtin_rotateright64yy +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u64i + +// LLVM-LABEL: @_Z26test_builtin_rotateright64yy +// LLVM: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %{{.+}} = call i64 @llvm.fshr.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z26test_builtin_rotateright64yy +// OGCG: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %{{.+}} = call i64 @llvm.fshr.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp index ad0e478040836..d9a70683a4dbc 100644 --- a/clang/test/CIR/CodeGen/builtin_call.cpp +++ b/clang/test/CIR/CodeGen/builtin_call.cpp @@ -111,6 +111,22 @@ void assume(bool arg) { // OGCG: call void @llvm.assume(i1 %{{.+}}) // OGCG: } +void assume_separate_storage(void *p1, void *p2) { + __builtin_assume_separate_storage(p1, p2); +} + +// CIR: cir.func{{.*}} 
@_Z23assume_separate_storagePvS_ +// CIR: cir.assume_separate_storage %{{.+}}, %{{.+}} : !cir.ptr +// CIR: } + +// LLVM: define {{.*}}void @_Z23assume_separate_storagePvS_ +// LLVM: call void @llvm.assume(i1 true) [ "separate_storage"(ptr %{{.+}}, ptr %{{.+}}) ] +// LLVM: } + +// OGCG: define {{.*}}void @_Z23assume_separate_storagePvS_ +// OGCG: call void @llvm.assume(i1 true) [ "separate_storage"(ptr %{{.+}}, ptr %{{.+}}) ] +// OGCG: } + void expect(int x, int y) { __builtin_expect(x, y); } diff --git a/clang/test/CIR/CodeGen/complex-builtins.cpp b/clang/test/CIR/CodeGen/complex-builtins.cpp index f0d12d0ef6663..811af47a704f5 100644 --- a/clang/test/CIR/CodeGen/complex-builtins.cpp +++ b/clang/test/CIR/CodeGen/complex-builtins.cpp @@ -83,3 +83,39 @@ void foo3() { // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1 // OGCG: %[[A_IMAG:.*]] = load double, ptr %[[A_IMAG_PTR]], align 8 // OGCG: store double %[[A_IMAG]], ptr %[[INIT]], align 8 + +void foo4() { + float _Complex a; + float _Complex b = __builtin_conjf(a); +} + +// CIR: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float +// CIR: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex +// CIR: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex, !cir.ptr> + +// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[RESULT:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[COMPLEX]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = fneg float %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG_MINUS]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[RESULT]], align 4 + +// OGCG: %[[COMPLEX:.*]] = alloca { float, float }, align 4 +// OGCG: %[[RESULT:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 diff --git a/clang/test/CIR/CodeGen/complex-unary.cpp b/clang/test/CIR/CodeGen/complex-unary.cpp new file mode 100644 index 0000000000000..676b5546d28e0 --- /dev/null +++ b/clang/test/CIR/CodeGen/complex-unary.cpp @@ -0,0 +1,286 @@ +// RUN: %clang_cc1 -triple 
x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-canonicalize -o %t.cir %s 2>&1 | FileCheck --check-prefix=CIR-BEFORE %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-after=cir-lowering-prepare -o %t.cir %s 2>&1 | FileCheck --check-prefixes=CIR-AFTER %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +void foo() { + int _Complex a; + int _Complex b = ~a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !s32i +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !s32i +// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !s32i, !s32i +// CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !s32i -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { i32, i32 }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { i32, i32 } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { i32, i32 } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = sub i32 0, %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { i32, i32 } {{.*}}, i32 %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { i32, i32 } %[[RESULT_TMP]], i32 %[[IMAG_MINUS]], 1 +// LLVM: store { i32, i32 } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load i32, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load i32, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = sub i32 0, %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store i32 %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store i32 %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo2() { + float _Complex a; + float _Complex b = ~a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = 
cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float +// CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = fneg float %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG_MINUS]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo3() { + float _Complex a; + float _Complex b = a++; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.unary(inc, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[REAL_INC:.*]] = cir.unary(inc, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: 
%[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_INC]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_INC:.*]] = fadd float 1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_INC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[TMP]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_INC:.*]] = fadd float %[[A_REAL]], 1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_INC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo4() { + float _Complex a; + float _Complex b = ++a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.unary(inc, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[REAL_INC:.*]] = cir.unary(inc, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_INC]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], 
%[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_INC:.*]] = fadd float 1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_INC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_INC:.*]] = fadd float %[[A_REAL]], 1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_INC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_INC]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo5() { + float _Complex a; + float _Complex b = a--; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.unary(dec, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[REAL_DEC:.*]] = cir.unary(dec, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_DEC]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr 
%[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_DEC:.*]] = fadd float -1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_DEC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[TMP]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_DEC:.*]] = fadd float %[[A_REAL]], -1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_DEC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo6() { + float _Complex a; + float _Complex b = --a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.unary(dec, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[REAL_DEC:.*]] = cir.unary(dec, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_DEC]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_DEC:.*]] = fadd float -1.000000e+00, %[[REAL]] 
+// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_DEC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_DEC:.*]] = fadd float %[[A_REAL]], -1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_DEC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_DEC]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 diff --git a/clang/test/CIR/CodeGen/compound_literal.cpp b/clang/test/CIR/CodeGen/compound_literal.cpp new file mode 100644 index 0000000000000..a92af95c62a1b --- /dev/null +++ b/clang/test/CIR/CodeGen/compound_literal.cpp @@ -0,0 +1,99 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +int foo() { + int e = (int){1}; + return e; +} + +// CIR: %[[RET:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] +// CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] +// CIR: %[[COMPOUND:.*]] = cir.alloca !s32i, !cir.ptr, [".compoundliteral", init] +// CIR: %[[VALUE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[VALUE]], %[[COMPOUND]] : !s32i, !cir.ptr +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPOUND]] : !cir.ptr, !s32i +// CIR: cir.store{{.*}} %[[TMP]], %[[INIT]] : !s32i, !cir.ptr +// CIR: %[[TMP_2:.*]] = cir.load{{.*}} %[[INIT]] : !cir.ptr, !s32i +// CIR: cir.store %[[TMP_2]], %[[RET]] : !s32i, !cir.ptr +// CIR: %[[TMP_3:.*]] = cir.load %[[RET]] : !cir.ptr, !s32i +// CIR: cir.return %[[TMP_3]] : !s32i + +// LLVM: %[[RET:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[COMPOUND:.*]] = alloca i32, i64 1, align 4 +// LLVM: store i32 1, ptr %[[COMPOUND]], align 4 +// LLVM: %[[TMP:.*]] = load i32, ptr %[[COMPOUND]], align 4 +// LLVM: store i32 %[[TMP]], ptr %[[INIT]], align 4 +// LLVM: %[[TMP_2:.*]] = load i32, ptr %[[INIT]], align 4 +// LLVM: store i32 %[[TMP_2]], ptr %[[RET]], align 4 +// LLVM: %[[TMP_3:.*]] = load i32, ptr %[[RET]], align 4 +// LLVM: ret 
i32 %[[TMP_3]] + +// OGCG: %[[INIT:.*]] = alloca i32, align 4 +// OGCG: %[[COMPOUND:.*]] = alloca i32, align 4 +// OGCG: store i32 1, ptr %[[COMPOUND]], align 4 +// OGCG: %[[TMP:.*]] = load i32, ptr %[[COMPOUND]], align 4 +// OGCG: store i32 %[[TMP]], ptr %[[INIT]], align 4 +// OGCG: %[[TMP_2:.*]] = load i32, ptr %[[INIT]], align 4 +// OGCG: ret i32 %[[TMP_2]] + +void foo2() { + int _Complex a = (int _Complex) { 1, 2}; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a", init] +// CIR: %[[CL_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, [".compoundliteral"] +// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s32i> : !cir.complex +// CIR: cir.store{{.*}} %[[COMPLEX]], %[[CL_ADDR]] : !cir.complex, !cir.ptr> +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[CL_ADDR]] : !cir.ptr>, !cir.complex +// CIR: cir.store{{.*}} %[[TMP]], %[[A_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[CL_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: store { i32, i32 } { i32 1, i32 2 }, ptr %[[CL_ADDR]], align 4 +// LLVM: %[[TMP:.*]] = load { i32, i32 }, ptr %[[CL_ADDR]], align 4 +// LLVM: store { i32, i32 } %[[TMP]], ptr %[[A_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[CL_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[CL_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CL_ADDR]], i32 0, i32 0 +// OGCG: %[[CL_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CL_ADDR]], i32 0, i32 1 +// OGCG: store i32 1, ptr %[[CL_REAL_PTR]], align 4 +// OGCG: store i32 2, ptr %[[CL_IMAG_PTR]], align 4 +// OGCG: %[[CL_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CL_ADDR]], i32 0, i32 0 +// OGCG: %[[CL_REAL:.*]] = load i32, ptr %[[CL_REAL_PTR]], align 4 +// OGCG: %[[CL_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CL_ADDR]], i32 0, i32 1 +// OGCG: %[[CL_IMAG:.*]] = load i32, ptr %[[CL_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store i32 %[[CL_REAL]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store i32 %[[CL_IMAG]], ptr %[[A_IMAG_PTR]], align 4 + +void foo3() { + typedef int vi4 __attribute__((vector_size(16))); + auto a = (vi4){10, 20, 30, 40}; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] +// CIR: %[[CL_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, [".compoundliteral", init] +// CIR: %[[VEC:.*]] = cir.const #cir.const_vector<[#cir.int<10> : !s32i, #cir.int<20> : !s32i, #cir.int<30> : !s32i, #cir.int<40> : !s32i]> : !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[VEC]], %[[CL_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[CL_ADDR]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[TMP]], %[[A_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[CL_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: store <4 x i32> , ptr %[[CL_ADDR]], align 16 +// LLVM: %[[TMP:.*]] = load <4 x i32>, ptr %[[CL_ADDR]], align 16 +// LLVM: store <4 x i32> %[[TMP]], ptr %[[A_ADDR]], align 16 + +// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[CL_ADDR:.*]] = alloca <4 x i32>, align 16 +// OGCG: store <4 x i32> , ptr %[[CL_ADDR]], align 16 
+// OGCG: %[[TMP:.*]] = load <4 x i32>, ptr %[[CL_ADDR]], align 16 +// OGCG: store <4 x i32> %[[TMP]], ptr %[[A_ADDR]], align 16 + diff --git a/clang/test/CIR/CodeGen/destructors.cpp b/clang/test/CIR/CodeGen/destructors.cpp index d8f9f23ae191c..de7718f0998fc 100644 --- a/clang/test/CIR/CodeGen/destructors.cpp +++ b/clang/test/CIR/CodeGen/destructors.cpp @@ -31,11 +31,11 @@ out_of_line_destructor::~out_of_line_destructor() { // OGCG: ret void // CIR: cir.func dso_local @_ZN22out_of_line_destructorD1Ev(%{{.+}}: !cir.ptr -// CIR: cir.call @_Z13some_functionv() nothrow : () -> () +// CIR: cir.call @_ZN22out_of_line_destructorD2Ev(%{{.*}}) nothrow : (!cir.ptr) // CIR: cir.return // LLVM: define dso_local void @_ZN22out_of_line_destructorD1Ev(ptr %{{.+}}) -// LLVM: call void @_Z13some_functionv() +// LLVM: call void @_ZN22out_of_line_destructorD2Ev // LLVM: ret void // OGCG: define dso_local void @_ZN22out_of_line_destructorD1Ev(ptr {{.*}}%{{.+}}) diff --git a/clang/test/CXX/drs/cwg14xx.cpp b/clang/test/CXX/drs/cwg14xx.cpp index 17d5c2fc2e210..047df171afffa 100644 --- a/clang/test/CXX/drs/cwg14xx.cpp +++ b/clang/test/CXX/drs/cwg14xx.cpp @@ -6,6 +6,14 @@ // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx11-17,since-cxx11, -fexceptions -fcxx-exceptions -pedantic-errors -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx14-17,cxx11-17,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++17 %s -verify=expected,cxx14-17,cxx11-17,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx14,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors -fexperimental-new-constant-interpreter + namespace cwg1413 { // cwg1413: 12 template struct Check { typedef int type; @@ -107,6 +115,8 @@ void f() { constexpr int p = &*a; // since-cxx11-error@-1 {{cannot initialize a variable of type 'const int' with an rvalue of type 'A *'}} constexpr A *p2 = &*a; + // since-cxx11-error@-1 {{constexpr variable 'p2' must be initialized by a constant expression}} + // since-cxx11-note@-2 {{dereferencing a null pointer}} } struct A { diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp index a53a8d1ed64a8..556407afa2641 100644 --- a/clang/test/CXX/drs/cwg2xx.cpp +++ b/clang/test/CXX/drs/cwg2xx.cpp @@ -1429,7 +1429,7 @@ namespace cwg299 { // cwg299: 2.8 c++11 // cxx98-11-error@#cwg299-q {{ambiguous conversion of array size expression of type 'T' to an integral or enumeration type}} // cxx98-11-note@#cwg299-int {{conversion to integral type 'int' declared here}} // cxx98-11-note@#cwg299-ushort {{conversion to 
integral type 'unsigned short' declared here}} - // since-cxx14-error-re@#cwg299-q {{{{conversion from 'T' to 'unsigned (long long|long|int)' is ambiguous}}}} + // since-cxx14-error-re@#cwg299-q {{conversion from 'T' to '__size_t' (aka 'unsigned {{long long|long|int}}') is ambiguous}} // since-cxx14-note@#cwg299-int {{candidate function}} // since-cxx14-note@#cwg299-ushort {{candidate function}} } // namespace cwg299 diff --git a/clang/test/CXX/drs/cwg8xx.cpp b/clang/test/CXX/drs/cwg8xx.cpp index ecb9113ccfe66..7395f04c8e399 100644 --- a/clang/test/CXX/drs/cwg8xx.cpp +++ b/clang/test/CXX/drs/cwg8xx.cpp @@ -9,10 +9,10 @@ namespace cwg820 { // cwg820: 2.7 export template struct B {}; // cxx98-17-warning@-1 {{exported templates are unsupported}} -// since-cxx20-error@-2 {{export declaration can only be used within a module purview}} +// since-cxx20-error@-2 {{export declaration can only be used within a module interface}} export template void f() {} // cxx98-17-warning@-1 {{exported templates are unsupported}} -// since-cxx20-error@-2 {{export declaration can only be used within a module purview}} +// since-cxx20-error@-2 {{export declaration can only be used within a module interface}} } // namespace cwg820 namespace cwg873 { // cwg873: 3.0 diff --git a/clang/test/CXX/expr/expr.const/p2-0x.cpp b/clang/test/CXX/expr/expr.const/p2-0x.cpp index c6c3381be5523..910c8635f7353 100644 --- a/clang/test/CXX/expr/expr.const/p2-0x.cpp +++ b/clang/test/CXX/expr/expr.const/p2-0x.cpp @@ -199,15 +199,15 @@ namespace UndefinedBehavior { constexpr A *na = nullptr; constexpr B *nb = nullptr; - constexpr A &ra = *nb; // expected-error {{constant expression}} expected-note {{cannot access base class of null pointer}} - constexpr B &rb = (B&)*na; // expected-error {{constant expression}} expected-note {{cannot access derived class of null pointer}} + constexpr A &ra = *nb; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} + constexpr B &rb = (B&)*na; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} static_assert((A*)nb == 0, ""); static_assert((B*)na == 0, ""); constexpr const int &nf = nb->n; // expected-error {{constant expression}} expected-note {{cannot access field of null pointer}} constexpr const int &mf = nb->m; // expected-error {{constant expression}} expected-note {{cannot access field of null pointer}} constexpr const int *np1 = (int*)nullptr + 0; // ok - constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // ok - constexpr const int *np3 = &(*(int(*)[4])nullptr)[2]; // expected-error {{constant expression}} expected-note {{cannot perform pointer arithmetic on null pointer}} + constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} + constexpr const int *np3 = &(*(int(*)[4])nullptr)[2]; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} struct C { constexpr int f() const { return 0; } @@ -485,7 +485,7 @@ namespace std { namespace TypeId { struct S { virtual void f(); }; constexpr S *p = 0; - constexpr const std::type_info &ti1 = typeid(*p); // expected-error {{must be initialized by a constant expression}} cxx11-note {{typeid applied to expression of polymorphic type 'S'}} cxx20-note {{dereferenced null pointer}} + constexpr const std::type_info &ti1 = typeid(*p); // expected-error {{must be initialized by a constant expression}} cxx11-note {{typeid applied to expression of polymorphic type 'S'}} 
cxx20-note {{dereferencing a null pointer}} struct T {} t; constexpr const std::type_info &ti2 = typeid(t); diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp index 6942b68690c5d..d439f304b5101 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp @@ -5,11 +5,11 @@ typedef decltype(sizeof(int)) size_t; // FIXME: These diagnostics should say 'size_t' instead of 'unsigned long' int a = 123_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'unsigned long long' or 'const char *', and no matching literal operator template}} int b = 4.2_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'long double' or 'const char *', and no matching literal operator template}} -int c = "foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and 'unsigned}} -int d = L"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const wchar_t *' and 'unsigned}} -int e = u8"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and 'unsigned}} -int f = u"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char16_t *' and 'unsigned}} -int g = U"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char32_t *' and 'unsigned}} +int c = "foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and '__size_t' (aka 'unsigned}} +int d = L"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const wchar_t *' and '__size_t' (aka 'unsigned}} +int e = u8"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and '__size_t' (aka 'unsigned}} +int f = u"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char16_t *' and '__size_t' (aka 'unsigned}} +int g = U"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char32_t *' and '__size_t' (aka 'unsigned}} int h = 'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'char'}} int i = L'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'wchar_t'}} int j = u'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'char16_t'}} diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp index afadba282e626..463d7854867a2 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp @@ -13,7 +13,7 @@ float &operator ""_x1 (const char8_t *, size_t); using char8 = double; #endif char8 &i2 = u8"foo"_x1; -double &i3 = L"foo"_x1; // expected-error {{no matching literal operator for call to 'operator""_x1' with arguments of types 'const wchar_t *' and 'unsigned long'}} +double &i3 = L"foo"_x1; // expected-error {{no matching literal operator for call to 'operator""_x1' with arguments of types 'const wchar_t *' and '__size_t' 
(aka 'unsigned long')}} char &operator ""_x1(const wchar_t *, size_t); char &i4 = L"foo"_x1; // ok @@ -46,8 +46,8 @@ template float &operator""_s(); void no_fallback() { "hello"_s; // FIXME: It'd be useful to explain what candidates were found and why they didn't work. - "xyzzy"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and 'unsigned long', and no matching literal operator template}} - "yello"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and 'unsigned long', and no matching literal operator template}} + "xyzzy"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long'), and no matching literal operator template}} + "yello"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long'), and no matching literal operator template}} } double &operator""_s(const char*, size_t); diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp index d571fcb8697eb..17d9c83055a1c 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp @@ -17,7 +17,7 @@ int main() { auto v1 = 1.2_w; // calls operator""_w(1.2L) auto v2 = u"one"_w; // calls operator""_w(u"one", 3) auto v3 = 12_w; // calls operator""_w("12") - "two"_w; // expected-error {{no matching literal operator for call to 'operator""_w' with arguments of types 'const char *' and 'unsigned long'}} + "two"_w; // expected-error {{no matching literal operator for call to 'operator""_w' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long')}} same_type test1; same_type test2; diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm index 2158d7fa84b86..ebc76ad16467d 100644 --- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm +++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm @@ -9,7 +9,7 @@ //--- ExportDeclNotInModulePurview.cppm // expected-error@* {{missing 'export module' declaration in module interface unit}} -export int b; // expected-error {{export declaration can only be used within a module purview}} +export int b; // expected-error {{export declaration can only be used within a module interface}} //--- A.cppm // expected-no-diagnostics @@ -18,7 +18,7 @@ export int a; //--- AddExport.cppm module A; // #module-decl -export int b; // expected-error {{export declaration can only be used within a module purview}} +export int b; // expected-error {{export declaration can only be used within a module interface}} // expected-note@#module-decl {{add 'export' here}} //--- AddExport2.cppm diff --git a/clang/test/CXX/module/module.interface/p1.cpp b/clang/test/CXX/module/module.interface/p1.cpp index c3bfca930f5cc..1754d9ea14618 100644 --- a/clang/test/CXX/module/module.interface/p1.cpp +++ b/clang/test/CXX/module/module.interface/p1.cpp @@ -7,7 +7,7 @@ //--- errors.cpp module; -export int a; // expected-error {{export declaration can only be used within a module purview}} +export int a; // expected-error {{export declaration can only be used within a module interface}} export module M; export int b; // #1 namespace N { @@ -37,8 +37,8 @@ namespace N { //--- 
impl.cpp module M; // #M -export int b2; // expected-error {{export declaration can only be used within a module purview}} +export int b2; // expected-error {{export declaration can only be used within a module interface}} namespace N { - export int c2; // expected-error {{export declaration can only be used within a module purview}} + export int c2; // expected-error {{export declaration can only be used within a module interface}} } // expected-note@#M 2+{{add 'export'}} diff --git a/clang/test/ClangScanDeps/modules-full-named-modules.cppm b/clang/test/ClangScanDeps/modules-full-named-modules.cppm index 5967a8705c09d..c69a215a62dc1 100644 --- a/clang/test/ClangScanDeps/modules-full-named-modules.cppm +++ b/clang/test/ClangScanDeps/modules-full-named-modules.cppm @@ -92,14 +92,7 @@ export void Hello(); // CHECK-NEXT: ] // CHECK: "command-line": [ // CHECK: "-o", -// CHECK-NEXT: "{{.*}}/M-{{.*}}.pcm" -// CHECK: ] -// CHECK: "input-file": "[[PREFIX]]/M.cppm" -// CHECK: }, -// CHECK-NEXT: { -// CHECK: "command-line": [ -// CHECK: "-o", -// CHECK-NEXT: "[[PREFIX]]/M.o" +// CHECK-NEXT: "{{.*}}/M.o" // CHECK: ] // CHECK: "input-file": "[[PREFIX]]/M.cppm" // CHECK: } @@ -160,18 +153,7 @@ void World() { // CHECK-NEXT: ] // CHECK: "command-line": [ // CHECK: "-o", -// CHECK-NEXT: "{{.*}}/impl_part-{{.*}}.pcm", -// CHECK: ] -// CHECK: "input-file": "[[PREFIX]]/impl_part.cppm" -// CHECK: }, -// CHECK-NEXT: { -// CHECK: "named-module": "M:impl_part" -// CHECK-NEXT: "named-module-deps": [ -// CHECK-NEXT: "M:interface_part" -// CHECK-NEXT: ] -// CHECK: "command-line": [ -// CHECK: "-o", -// CHECK-NEXT: "[[PREFIX]]/impl_part.o", +// CHECK-NEXT: "{{.*}}/impl_part.o", // CHECK: ] // CHECK: "input-file": "[[PREFIX]]/impl_part.cppm" // CHECK: } @@ -194,16 +176,7 @@ export void World(); // CHECK-NOT: "named-module-deps": [] // CHECK: "command-line": [ // CHECK: "-o", -// CHECK-NEXT: "{{.*}}/interface_part-{{.*}}.pcm", -// CHECK: ] -// CHECK: "input-file": "[[PREFIX]]/interface_part.cppm" -// CHECK: }, -// CHECK-NEXT: { -// CHECK: "named-module": "M:interface_part" -// CHECK-NOT: "named-module-deps": [] -// CHECK: "command-line": [ -// CHECK: "-o", -// CHECK-NEXT: "[[PREFIX]]/interface_part.o", +// CHECK-NEXT: "{{.*}}/interface_part.o", // CHECK: ] // CHECK: "input-file": "[[PREFIX]]/interface_part.cppm" // CHECK: } @@ -259,14 +232,7 @@ int main() { // CHECK-NEXT: ] // CHECK: "command-line": [ // CHECK: "-o", -// CHECK-NEXT: "{{.*}}/M-{{.*}}.pcm" -// CHECK: ] -// CHECK: "input-file": "[[PREFIX]]/M.cppm" -// CHECK: }, -// CHECK-NEXT: { -// CHECK: "command-line": [ -// CHECK: "-o", -// CHECK-NEXT: "[[PREFIX]]/M.o" +// CHECK-NEXT: "{{.*}}/M.o" // CHECK: ] // CHECK: "input-file": "[[PREFIX]]/M.cppm" // CHECK: }, @@ -292,18 +258,7 @@ int main() { // CHECK-NEXT: ] // CHECK: "command-line": [ // CHECK: "-o", -// CHECK-NEXT: "{{.*}}/impl_part-{{.*}}.pcm", -// CHECK: ] -// CHECK: "input-file": "[[PREFIX]]/impl_part.cppm" -// CHECK: }, -// CHECK-NEXT: { -// CHECK: "named-module": "M:impl_part" -// CHECK-NEXT: "named-module-deps": [ -// CHECK-NEXT: "M:interface_part" -// CHECK-NEXT: ] -// CHECK: "command-line": [ -// CHECK: "-o", -// CHECK-NEXT: "[[PREFIX]]/impl_part.o", +// CHECK-NEXT: "{{.*}}/impl_part.o", // CHECK: ] // CHECK: "input-file": "[[PREFIX]]/impl_part.cppm" // CHECK: } @@ -316,16 +271,7 @@ int main() { // CHECK-NOT: "named-module-deps": [] // CHECK: "command-line": [ // CHECK: "-o", -// CHECK-NEXT: "{{.*}}/interface_part-{{.*}}.pcm", -// CHECK: ] -// CHECK: "input-file": "[[PREFIX]]/interface_part.cppm" 
-// CHECK: }, -// CHECK-NEXT: { -// CHECK: "named-module": "M:interface_part" -// CHECK-NOT: "named-module-deps": [] -// CHECK: "command-line": [ -// CHECK: "-o", -// CHECK-NEXT: "[[PREFIX]]/interface_part.o", +// CHECK-NEXT: "{{.*}}/interface_part.o", // CHECK: ] // CHECK: "input-file": "[[PREFIX]]/interface_part.cppm" // CHECK: } diff --git a/clang/test/CodeGen/64bit-swiftcall.c b/clang/test/CodeGen/64bit-swiftcall.c index 7f8aa02d97ce1..448bca7acbca3 100644 --- a/clang/test/CodeGen/64bit-swiftcall.c +++ b/clang/test/CodeGen/64bit-swiftcall.c @@ -239,7 +239,7 @@ TEST(struct_big_1) // CHECK-LABEL: define {{.*}} void @return_struct_big_1(ptr dead_on_unwind noalias writable sret // Should not be byval. -// CHECK-LABEL: define {{.*}} void @take_struct_big_1(ptr{{( %.*)?}}) +// CHECK-LABEL: define {{.*}} void @take_struct_big_1(ptr dead_on_return{{( %.*)?}}) /*****************************************************************************/ /********************************* TYPE MERGING ******************************/ diff --git a/clang/test/CodeGen/AArch64/byval-temp.c b/clang/test/CodeGen/AArch64/byval-temp.c index 0ee0312b2362d..5033b6cf5ac03 100644 --- a/clang/test/CodeGen/AArch64/byval-temp.c +++ b/clang/test/CodeGen/AArch64/byval-temp.c @@ -30,10 +30,10 @@ void example(void) { // Then, memcpy `l` to the temporary stack space. // CHECK-O0-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[byvaltemp]], ptr align 8 %[[l]], i64 64, i1 false) // Finally, call using a pointer to the temporary stack space. -// CHECK-O0-NEXT: call void @pass_large(ptr noundef %[[byvaltemp]]) +// CHECK-O0-NEXT: call void @pass_large(ptr dead_on_return noundef %[[byvaltemp]]) // Now, do the same for the second call, using the second temporary alloca. // CHECK-O0-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[byvaltemp1]], ptr align 8 %[[l]], i64 64, i1 false) -// CHECK-O0-NEXT: call void @pass_large(ptr noundef %[[byvaltemp1]]) +// CHECK-O0-NEXT: call void @pass_large(ptr dead_on_return noundef %[[byvaltemp1]]) // CHECK-O0-NEXT: ret void // // At O3, we should have lifetime markers to help the optimizer re-use the temporary allocas. @@ -58,7 +58,7 @@ void example(void) { // Then, memcpy `l` to the temporary stack space. // CHECK-O3-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[byvaltemp]], ptr align 8 %[[l]], i64 64, i1 false) // Finally, call using a pointer to the temporary stack space. -// CHECK-O3-NEXT: call void @pass_large(ptr noundef %[[byvaltemp]]) +// CHECK-O3-NEXT: call void @pass_large(ptr dead_on_return noundef %[[byvaltemp]]) // // The lifetime of the temporary used to pass a pointer to the struct ends here. // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 64, ptr %[[byvaltemp]]) @@ -66,7 +66,7 @@ void example(void) { // Now, do the same for the second call, using the second temporary alloca. // CHECK-O3-NEXT: call void @llvm.lifetime.start.p0(i64 64, ptr %[[byvaltemp1]]) // CHECK-O3-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[byvaltemp1]], ptr align 8 %[[l]], i64 64, i1 false) -// CHECK-O3-NEXT: call void @pass_large(ptr noundef %[[byvaltemp1]]) +// CHECK-O3-NEXT: call void @pass_large(ptr dead_on_return noundef %[[byvaltemp1]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 64, ptr %[[byvaltemp1]]) // // Mark the end of the lifetime of `l`. 
@@ -88,12 +88,12 @@ void example_BitInt(void) { // CHECK-O0-NEXT: [[LOADEDV:%.*]] = trunc i256 [[TMP0]] to i129 // CHECK-O0-NEXT: [[STOREDV:%.*]] = sext i129 [[LOADEDV]] to i256 // CHECK-O0-NEXT: store i256 [[STOREDV]], ptr [[INDIRECT_ARG_TEMP]], align 16 -// CHECK-O0-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP]]) +// CHECK-O0-NEXT: call void @pass_large_BitInt(ptr dead_on_return noundef [[INDIRECT_ARG_TEMP]]) // CHECK-O0-NEXT: [[TMP1:%.*]] = load i256, ptr [[L]], align 16 // CHECK-O0-NEXT: [[LOADEDV1:%.*]] = trunc i256 [[TMP1]] to i129 // CHECK-O0-NEXT: [[STOREDV1:%.*]] = sext i129 [[LOADEDV1]] to i256 // CHECK-O0-NEXT: store i256 [[STOREDV1]], ptr [[INDIRECT_ARG_TEMP1]], align 16 -// CHECK-O0-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP1]]) +// CHECK-O0-NEXT: call void @pass_large_BitInt(ptr dead_on_return noundef [[INDIRECT_ARG_TEMP1]]) // CHECK-O0-NEXT: ret void // // CHECK-O3-LABEL: define dso_local void @example_BitInt( @@ -108,13 +108,13 @@ void example_BitInt(void) { // CHECK-O3-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[INDIRECT_ARG_TEMP]]) // CHECK-O3-NEXT: [[STOREDV:%.*]] = sext i129 [[LOADEDV]] to i256 // CHECK-O3-NEXT: store i256 [[STOREDV]], ptr [[INDIRECT_ARG_TEMP]], align 16, !tbaa [[TBAA6]] -// CHECK-O3-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP]]) +// CHECK-O3-NEXT: call void @pass_large_BitInt(ptr dead_on_return noundef [[INDIRECT_ARG_TEMP]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[INDIRECT_ARG_TEMP]]) // CHECK-O3-NEXT: [[TMP1:%.*]] = load i256, ptr [[L]], align 16, !tbaa [[TBAA6]] // CHECK-O3-NEXT: [[LOADEDV1:%.*]] = trunc i256 [[TMP1]] to i129 // CHECK-O3-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[INDIRECT_ARG_TEMP1]]) // CHECK-O3-NEXT: [[STOREDV1:%.*]] = sext i129 [[LOADEDV1]] to i256 // CHECK-O3-NEXT: store i256 [[STOREDV1]], ptr [[INDIRECT_ARG_TEMP1]], align 16, !tbaa [[TBAA6]] -// CHECK-O3-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP1]]) +// CHECK-O3-NEXT: call void @pass_large_BitInt(ptr dead_on_return noundef [[INDIRECT_ARG_TEMP1]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[INDIRECT_ARG_TEMP1]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[L]]) diff --git a/clang/test/CodeGen/AArch64/pure-scalable-args-empty-union.c b/clang/test/CodeGen/AArch64/pure-scalable-args-empty-union.c index 546910068c78a..804e14a2ea34b 100644 --- a/clang/test/CodeGen/AArch64/pure-scalable-args-empty-union.c +++ b/clang/test/CodeGen/AArch64/pure-scalable-args-empty-union.c @@ -19,7 +19,7 @@ void f0(S0 *p) { use0(*p); } // CHECK-C: declare void @use0(, , , ) -// CHECK-CXX: declare void @use0(ptr noundef) +// CHECK-CXX: declare void @use0(ptr dead_on_return noundef) #ifdef __cplusplus diff --git a/clang/test/CodeGen/AArch64/pure-scalable-args.c b/clang/test/CodeGen/AArch64/pure-scalable-args.c index fecd370d09be3..48988f7a1722b 100644 --- a/clang/test/CodeGen/AArch64/pure-scalable-args.c +++ b/clang/test/CodeGen/AArch64/pure-scalable-args.c @@ -92,7 +92,7 @@ void test_argpass_simple(PST *p) { // CHECK-AAPCS-NEXT: ret void // CHECK-AAPCS: declare void @argpass_simple_callee(, , , , , ) -// CHECK-DARWIN: declare void @argpass_simple_callee(ptr noundef) +// CHECK-DARWIN: declare void @argpass_simple_callee(ptr dead_on_return noundef) // Boundary case of using the last available Z-reg, PST expanded. 
// 0.0 -> d0-d3 @@ -107,7 +107,7 @@ void test_argpass_last_z(PST *p) { argpass_last_z_callee(.0, .0, .0, .0, *p); } // CHECK-AAPCS: declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, , , , , , ) -// CHECK-DARWIN: declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, ptr noundef) +// CHECK-DARWIN: declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, ptr dead_on_return noundef) // Like the above, but using a tuple type to occupy some registers. @@ -123,7 +123,7 @@ void test_argpass_last_z_tuple(PST *p, svfloat64x4_t x) { argpass_last_z_tuple_callee(x, *p); } // CHECK-AAPCS: declare void @argpass_last_z_tuple_callee(, , , , , , , , , ) -// CHECK-DARWIN: declare void @argpass_last_z_tuple_callee(, , , , ptr noundef) +// CHECK-DARWIN: declare void @argpass_last_z_tuple_callee(, , , , ptr dead_on_return noundef) // Boundary case of using the last available P-reg, PST expanded. @@ -139,7 +139,7 @@ void test_argpass_last_p(PST *p) { argpass_last_p_callee(svpfalse(), svpfalse_c(), *p); } // CHECK-AAPCS: declare void @argpass_last_p_callee(, target("aarch64.svcount"), , , , , , ) -// CHECK-DARWIN: declare void @argpass_last_p_callee(, target("aarch64.svcount"), ptr noundef) +// CHECK-DARWIN: declare void @argpass_last_p_callee(, target("aarch64.svcount"), ptr dead_on_return noundef) // Not enough Z-regs, push PST to memory and pass a pointer, Z-regs and @@ -157,7 +157,7 @@ void test_argpass_no_z(PST *p, double dummy, svmfloat8_t u, int8x16_t v, mfloat8 void argpass_no_z_callee(svmfloat8_t, int8x16_t, mfloat8x16_t, double, double, int, PST, int, double, svbool_t); argpass_no_z_callee(u, v, w, .0, .0, 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_z_callee(, <16 x i8> noundef, <16 x i8>, double noundef, double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_z_callee(, <16 x i8> noundef, <16 x i8>, double noundef, double noundef, i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // Like the above, using a tuple to occupy some registers. @@ -173,7 +173,7 @@ void test_argpass_no_z_tuple_f64(PST *p, float dummy, svfloat64x4_t x) { double, svbool_t); argpass_no_z_tuple_f64_callee(x, .0, 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_z_tuple_f64_callee(, , , , double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_z_tuple_f64_callee(, , , , double noundef, i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // Likewise, using a different tuple. 
@@ -189,7 +189,7 @@ void test_argpass_no_z_tuple_mfp8(PST *p, float dummy, svmfloat8x4_t x) { double, svbool_t); argpass_no_z_tuple_mfp8_callee(x, .0, 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_z_tuple_mfp8_callee(, , , , double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_z_tuple_mfp8_callee(, , , , double noundef, i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // Not enough Z-regs (consumed by a HFA), PST passed indirectly @@ -204,8 +204,8 @@ void test_argpass_no_z_hfa(HFA *h, PST *p) { void argpass_no_z_hfa_callee(double, HFA, int, PST, int, svbool_t); argpass_no_z_hfa_callee(.0, *h, 1, *p, 2, svptrue_b64()); } -// CHECK-AAPCS: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float] alignstack(8), i32 noundef, ptr noundef, i32 noundef, ) -// CHECK-DARWIN: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float], i32 noundef, ptr noundef, i32 noundef, ) +// CHECK-AAPCS: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float] alignstack(8), i32 noundef, ptr dead_on_return noundef, i32 noundef, ) +// CHECK-DARWIN: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float], i32 noundef, ptr dead_on_return noundef, i32 noundef, ) // Not enough Z-regs (consumed by a HVA), PST passed indirectly // 0.0 -> d0 @@ -219,8 +219,8 @@ void test_argpass_no_z_hva(HVA *h, PST *p) { void argpass_no_z_hva_callee(double, HVA, int, PST, int, svbool_t); argpass_no_z_hva_callee(.0, *h, 1, *p, 2, svptrue_b64()); } -// CHECK-AAPCS: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>] alignstack(16), i32 noundef, ptr noundef, i32 noundef, ) -// CHECK-DARWIN: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>], i32 noundef, ptr noundef, i32 noundef, ) +// CHECK-AAPCS: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>] alignstack(16), i32 noundef, ptr dead_on_return noundef, i32 noundef, ) +// CHECK-DARWIN: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>], i32 noundef, ptr dead_on_return noundef, i32 noundef, ) // Not enough P-regs, PST passed indirectly, Z-regs and P-regs still available. // true -> p0-p2 @@ -233,7 +233,7 @@ void test_argpass_no_p(PST *p) { void argpass_no_p_callee(svbool_t, svbool_t, svbool_t, int, PST, int, double, svbool_t); argpass_no_p_callee(svptrue_b8(), svptrue_b16(), svptrue_b32(), 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_p_callee(, , , i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_p_callee(, , , i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // Like above, using a tuple to occupy some registers. @@ -250,7 +250,7 @@ void test_argpass_no_p_tuple(PST *p, svbool_t u, svboolx2_t v) { svbool_t); argpass_no_p_tuple_callee(v, u, 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_p_tuple_callee(, , , i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_p_tuple_callee(, , , i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // HFAs go back-to-back to memory, afterwards Z-regs not available, PST passed indirectly. 
@@ -263,8 +263,8 @@ void test_after_hfa(HFA *h, PST *p) { void after_hfa_callee(double, double, double, double, double, HFA, PST, HFA, svbool_t); after_hfa_callee(.0, .0, .0, .0, .0, *h, *p, *h, svpfalse()); } -// CHECK-AAPCS: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float] alignstack(8), ptr noundef, [4 x float] alignstack(8), ) -// CHECK-DARWIN: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float], ptr noundef, [4 x float], ) +// CHECK-AAPCS: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float] alignstack(8), ptr dead_on_return noundef, [4 x float] alignstack(8), ) +// CHECK-DARWIN: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float], ptr dead_on_return noundef, [4 x float], ) // Small PST, not enough registers, passed indirectly, unlike other small // aggregates. @@ -277,7 +277,7 @@ void test_small_pst(SmallPST *p, SmallAgg *s) { void small_pst_callee(SmallAgg, double, double, double, double, double, double, double, double, double, SmallPST, double); small_pst_callee(*s, .0, .0, .0, .0, .0, .0, .0, .0, 1.0, *p, 2.0); } -// CHECK-AAPCS: declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, ptr noundef, double noundef) +// CHECK-AAPCS: declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, ptr dead_on_return noundef, double noundef) // CHECK-DARWIN: declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, i128, double noundef) @@ -326,12 +326,12 @@ void test_pass_variadic(PST *p, PST *q) { pass_variadic_callee(*p, *q); } // CHECK-AAPCS: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp, ptr noundef nonnull align 16 dereferenceable(96) %q, i64 96, i1 false) -// CHECK-AAPCS: call void (, , , , , , ...) @pass_variadic_callee( %1, %cast.scalable1, %cast.scalable2, %cast.scalable3, %cast.scalable4, %12, ptr noundef nonnull %byval-temp) +// CHECK-AAPCS: call void (, , , , , , ...) @pass_variadic_callee( %1, %cast.scalable1, %cast.scalable2, %cast.scalable3, %cast.scalable4, %12, ptr dead_on_return noundef nonnull %byval-temp) // CHECK-DARWIN: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp, ptr noundef nonnull align 16 dereferenceable(96) %p, i64 96, i1 false) // CHECK-DARWIN: call void @llvm.lifetime.start.p0(i64 96, ptr nonnull %byval-temp1) // CHECK-DARWIN: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp1, ptr noundef nonnull align 16 dereferenceable(96) %q, i64 96, i1 false) -// CHECK-DARWIN: call void (ptr, ...) @pass_variadic_callee(ptr noundef nonnull %byval-temp, ptr noundef nonnull %byval-temp1) +// CHECK-DARWIN: call void (ptr, ...) 
@pass_variadic_callee(ptr dead_on_return noundef nonnull %byval-temp, ptr dead_on_return noundef nonnull %byval-temp1) // Test passing a small PST, still passed indirectly, despite being <= 128 bits @@ -340,7 +340,7 @@ void test_small_pst_variadic(SmallPST *p) { small_pst_variadic_callee(0, *p); } // CHECK-AAPCS: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) %byval-temp, ptr noundef nonnull align 16 dereferenceable(16) %p, i64 16, i1 false) -// CHECK-AAPCS: call void (i32, ...) @small_pst_variadic_callee(i32 noundef 0, ptr noundef nonnull %byval-temp) +// CHECK-AAPCS: call void (i32, ...) @small_pst_variadic_callee(i32 noundef 0, ptr dead_on_return noundef nonnull %byval-temp) // CHECK-DARWIN: %0 = load i128, ptr %p, align 16 // CHECK-DARWIN: tail call void (i32, ...) @small_pst_variadic_callee(i32 noundef 0, i128 %0) @@ -467,7 +467,7 @@ void test_tuple_reg_count(svfloat32_t x, svfloat32x2_t y) { svfloat32_t, svfloat32_t, svfloat32_t, svfloat32x2_t); test_tuple_reg_count_callee(x, x, x, x, x, x, x, y); } -// CHECK-AAPCS: declare void @test_tuple_reg_count_callee(, , , , , , , ptr noundef) +// CHECK-AAPCS: declare void @test_tuple_reg_count_callee(, , , , , , , ptr dead_on_return noundef) // CHECK-DARWIN: declare void @test_tuple_reg_count_callee(, , , , , , , , ) // Regression test for incorrect passing of SVE vector tuples @@ -476,5 +476,5 @@ void test_tuple_reg_count_bool(svboolx4_t x, svboolx4_t y) { void test_tuple_reg_count_bool_callee(svboolx4_t, svboolx4_t); test_tuple_reg_count_bool_callee(x, y); } -// CHECK-AAPCS: declare void @test_tuple_reg_count_bool_callee(, , , , ptr noundef) +// CHECK-AAPCS: declare void @test_tuple_reg_count_bool_callee(, , , , ptr dead_on_return noundef) // CHECK-DARWIN: declare void @test_tuple_reg_count_bool_callee(, , , , , , , ) diff --git a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp index b1232921df363..f0c9ef28201a5 100644 --- a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp +++ b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp @@ -82,7 +82,7 @@ struct Sppp { int *x, *y, *z; }; // CHECK-A64-LABEL: define dso_local void @_Z4Tppp4Sppp( -// CHECK-A64-SAME: ptr noundef [[S:%.*]]) #[[ATTR0]] { +// CHECK-A64-SAME: ptr dead_on_return noundef [[S:%.*]]) #[[ATTR0]] { // CHECK-A64-NEXT: [[ENTRY:.*:]] // CHECK-A64-NEXT: [[S_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 // CHECK-A64-NEXT: store ptr [[S]], ptr [[S_INDIRECT_ADDR]], align 8 @@ -490,7 +490,7 @@ struct Spa3 { int* xs[3]; }; // CHECK-A64-LABEL: define dso_local void @_Z4Tpa34Spa3( -// CHECK-A64-SAME: ptr noundef [[S:%.*]]) #[[ATTR0]] { +// CHECK-A64-SAME: ptr dead_on_return noundef [[S:%.*]]) #[[ATTR0]] { // CHECK-A64-NEXT: [[ENTRY:.*:]] // CHECK-A64-NEXT: [[S_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 // CHECK-A64-NEXT: store ptr [[S]], ptr [[S_INDIRECT_ADDR]], align 8 diff --git a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c index c3d0541229fac..d244a8ba88572 100644 --- a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c +++ b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c @@ -58,7 +58,7 @@ typedef int8_t vec_int8 __attribute__((vector_size(N / 8))); // CHECK128-NEXT: ret <16 x i8> [[CASTFIXEDSVE]] // CHECK-LABEL: define{{.*}} void @f2( -// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<[[#div(VBITS,8)]] x i8>) align 16 
captures(none) initializes((0, [[#div(VBITS,8)]])) %agg.result, ptr noundef readonly captures(none) %0) +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<[[#div(VBITS,8)]] x i8>) align 16 captures(none) initializes((0, [[#div(VBITS,8)]])) %agg.result, ptr dead_on_return noundef readonly captures(none) %0) // CHECK-NEXT: entry: // CHECK-NEXT: [[X:%.*]] = load <[[#div(VBITS,8)]] x i8>, ptr [[TMP0:%.*]], align 16, [[TBAA6:!tbaa !.*]] // CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8( poison, <[[#div(VBITS,8)]] x i8> [[X]], i64 0) @@ -88,13 +88,13 @@ typedef svint8_t vec2 __attribute__((arm_sve_vector_bits(N))); // CHECK-NEXT: [[X:%.*]] = tail call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8( [[X_COERCE:%.*]], i64 0) // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 [[SIZE:[0-9]+]], ptr nonnull [[INDIRECT_ARG_TEMP]]) #[[ATTR6:[0-9]+]] // CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[X]], ptr [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6]] -// CHECK-NEXT: call void @f3(ptr noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]] +// CHECK-NEXT: call void @f3(ptr dead_on_return noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 [[SIZE]], ptr nonnull [[INDIRECT_ARG_TEMP]]) #[[ATTR6:[0-9]+]] // CHECK-NEXT: ret void // CHECK128-LABEL: declare void @f3(<16 x i8> noundef) // CHECK-LABEL: declare void @f3( -// CHECK-SAME: ptr noundef) +// CHECK-SAME: ptr dead_on_return noundef) void g(vec2 x) { f3(x); } // OK #endif diff --git a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp index e82069aab2486..d42ecb663050f 100644 --- a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp +++ b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp @@ -75,7 +75,7 @@ typedef svint16_t vec2 __attribute__((arm_sve_vector_bits(N))); // CHECKWIDE-NEXT: [[X:%.*]] = tail call <[[#div(VBITS, 16)]] x i16> @llvm.vector.extract.v[[#div(VBITS, 16)]]i16.nxv8i16( [[X_COERCE:%.*]], i64 0) // CHECKWIDE-NEXT: call void @llvm.lifetime.start.p0(i64 [[SIZE:[0-9]+]], ptr nonnull [[INDIRECT_ARG_TEMP]]) #[[ATTR6:[0-9]+]] // CHECKWIDE-NEXT: store <[[#div(VBITS, 16)]] x i16> [[X]], ptr [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6:!tbaa !.*]] -// CHECKWIDE-NEXT: call void @_Z1fDv[[#div(VBITS, 16)]]_s(ptr noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]] +// CHECKWIDE-NEXT: call void @_Z1fDv[[#div(VBITS, 16)]]_s(ptr dead_on_return noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]] // CHECKWIDE-NEXT: call void @llvm.lifetime.end.p0(i64 [[SIZE]], ptr nonnull [[INDIRECT_ARG_TEMP]]) #[[ATTR6:[0-9]+]] // CHECKWIDE-NEXT: ret void void g(vec2 x) { f(x); } // OK diff --git a/clang/test/CodeGen/LoongArch/bitint.c b/clang/test/CodeGen/LoongArch/bitint.c index f346f569d0eb0..950f5308e32cc 100644 --- a/clang/test/CodeGen/LoongArch/bitint.c +++ b/clang/test/CodeGen/LoongArch/bitint.c @@ -26,12 +26,12 @@ void pass_BitInt129(_BitInt(129)); // LA32-NEXT: [[LOADEDV1:%.*]] = trunc i128 [[TMP1]] to i65 // LA32-NEXT: [[STOREDV:%.*]] = sext i65 [[LOADEDV1]] to i128 // LA32-NEXT: store i128 [[STOREDV]], ptr [[BYVAL_TEMP]], align 16 -// LA32-NEXT: call void @pass_BitInt65(ptr noundef [[BYVAL_TEMP]]) +// LA32-NEXT: call void @pass_BitInt65(ptr dead_on_return noundef [[BYVAL_TEMP]]) // LA32-NEXT: [[TMP2:%.*]] = load i256, ptr [[L129]], align 16 // LA32-NEXT: [[LOADEDV2:%.*]] = 
trunc i256 [[TMP2]] to i129 // LA32-NEXT: [[STOREDV4:%.*]] = sext i129 [[LOADEDV2]] to i256 // LA32-NEXT: store i256 [[STOREDV4]], ptr [[BYVAL_TEMP3]], align 16 -// LA32-NEXT: call void @pass_BitInt129(ptr noundef [[BYVAL_TEMP3]]) +// LA32-NEXT: call void @pass_BitInt129(ptr dead_on_return noundef [[BYVAL_TEMP3]]) // LA32-NEXT: ret void // // LA64-LABEL: define dso_local void @example_BitInt( @@ -54,7 +54,7 @@ void pass_BitInt129(_BitInt(129)); // LA64-NEXT: [[LOADEDV2:%.*]] = trunc i256 [[TMP2]] to i129 // LA64-NEXT: [[STOREDV:%.*]] = sext i129 [[LOADEDV2]] to i256 // LA64-NEXT: store i256 [[STOREDV]], ptr [[BYVAL_TEMP]], align 16 -// LA64-NEXT: call void @pass_BitInt129(ptr noundef [[BYVAL_TEMP]]) +// LA64-NEXT: call void @pass_BitInt129(ptr dead_on_return noundef [[BYVAL_TEMP]]) // LA64-NEXT: ret void // void example_BitInt(void) { diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c index 838db02415fe5..b46fa9f2cf157 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c @@ -11,7 +11,7 @@ // RUN: -S -ffp-exception-behavior=strict \ // RUN: -o - %s | FileCheck --check-prefix=CHECK-ASM \ // RUN: --check-prefix=FIXME-CHECK %s -// RUN: %clang_cc1 -triple powerpcspe -ffp-exception-behavior=strict \ +// RUN: %clang_cc1 -triple powerpc -ffp-exception-behavior=strict \ // RUN: -target-feature +vsx -fexperimental-strict-floating-point -emit-llvm \ // RUN: %s -o - | FileCheck --check-prefix=CHECK-CONSTRAINED %s diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-future-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-future-builtin-err.c deleted file mode 100644 index 9def39f5fa479..0000000000000 --- a/clang/test/CodeGen/PowerPC/ppc-dmf-future-builtin-err.c +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: not %clang_cc1 -triple powerpc64le-unknown-linux-gnu -target-cpu pwr10 \ -// RUN: %s -emit-llvm-only 2>&1 | FileCheck %s - -__attribute__((target("no-mma"))) -void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) { - __dmr1024 vdmr = *((__dmr1024 *)vdmrp); - __vector_pair vp = *((__vector_pair *)vpp); - __builtin_mma_dmsetdmrz(&vdmr); - __builtin_mma_dmmr(&vdmr, (__dmr1024*)vpp); - __builtin_mma_dmxor(&vdmr, (__dmr1024*)vpp); - -// CHECK: error: '__builtin_mma_dmsetdmrz' needs target feature mma,isa-future-instructions -// CHECK: error: '__builtin_mma_dmmr' needs target feature mma,isa-future-instructions -// CHECK: error: '__builtin_mma_dmxor' needs target feature mma,isa-future-instructions -} diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c index c02274696244a..5a92d6e982511 100644 --- a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c +++ b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c @@ -1,3 +1,5 @@ +// RUN: not %clang_cc1 -triple powerpc64le-unknown-linux-gnu -target-cpu pwr10 \ +// RUN: %s -emit-llvm-only 2>&1 | FileCheck %s // RUN: not %clang_cc1 -triple powerpc64le-unknown-linux-gnu -target-cpu future \ // RUN: %s -emit-llvm-only 2>&1 | FileCheck %s diff --git a/clang/test/CodeGen/PowerPC/ppc64-vector.c b/clang/test/CodeGen/PowerPC/ppc64-vector.c index 5d3dd86a009d5..2e99781f84910 100644 --- a/clang/test/CodeGen/PowerPC/ppc64-vector.c +++ b/clang/test/CodeGen/PowerPC/ppc64-vector.c @@ -39,7 +39,7 @@ v8i16 test_v8i16(v8i16 x) return x; } -// CHECK: define{{.*}} void @test_v16i16(ptr dead_on_unwind noalias writable sret(<16 x i16>) align 32 
%agg.result, ptr noundef %0) +// CHECK: define{{.*}} void @test_v16i16(ptr dead_on_unwind noalias writable sret(<16 x i16>) align 32 %agg.result, ptr dead_on_return noundef %0) v16i16 test_v16i16(v16i16 x) { return x; diff --git a/clang/test/CodeGen/RISCV/riscv-abi.cpp b/clang/test/CodeGen/RISCV/riscv-abi.cpp index fe1a2b6d8595c..d2e080829e72f 100644 --- a/clang/test/CodeGen/RISCV/riscv-abi.cpp +++ b/clang/test/CodeGen/RISCV/riscv-abi.cpp @@ -75,7 +75,7 @@ struct child3_int64_s : parent3_float_s { }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @_Z30float_int64_struct_inheritance14child3_int64_s -// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD3_INT64_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD3_INT64_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // // LP64-LABEL: define dso_local [2 x i64] @_Z30float_int64_struct_inheritance14child3_int64_s @@ -99,7 +99,7 @@ struct child4_double_s : parent4_double_s { }; // ILP32-ILP32F-LABEL: define dso_local void @_Z32double_double_struct_inheritance15child4_double_s -// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD4_DOUBLE_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD4_DOUBLE_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local { double, double } @_Z32double_double_struct_inheritance15child4_double_s @@ -130,11 +130,11 @@ struct child5_virtual_s : virtual parent5_virtual_s { }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @_Z38int32_float_virtual_struct_inheritance16child5_virtual_s -// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD5_VIRTUAL_S:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD5_VIRTUAL_S:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // // LP64-LP64F-LP64D-LABEL: define dso_local void @_Z38int32_float_virtual_struct_inheritance16child5_virtual_s -// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD5_VIRTUAL_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD5_VIRTUAL_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // struct child5_virtual_s int32_float_virtual_struct_inheritance(struct child5_virtual_s a) { diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c index 82e43fff0c3aa..bc89cb532bdcc 100644 --- a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c +++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c @@ -138,7 +138,7 @@ struct st_i32x4x9 { typedef int __attribute__((vector_size(256))) int32x64_t; -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_too_large(ptr noundef %0) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_too_large(ptr dead_on_return noundef %0) void __attribute__((riscv_vls_cc)) test_too_large(int32x64_t arg) {} // 
CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_too_large_256( noundef %arg.coerce) void __attribute__((riscv_vls_cc(256))) test_too_large_256(int32x64_t arg) {} @@ -173,9 +173,9 @@ void __attribute__((riscv_vls_cc)) test_st_i32x8x2(struct st_i32x8x2 arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x8x2_256(target("riscv.vector.tuple", , 2) %arg) void __attribute__((riscv_vls_cc(256))) test_st_i32x8x2_256(struct st_i32x8x2 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x64x2(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x64x2(ptr dead_on_return noundef %arg) void __attribute__((riscv_vls_cc)) test_st_i32x64x2(struct st_i32x64x2 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x64x2_256(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x64x2_256(ptr dead_on_return noundef %arg) void __attribute__((riscv_vls_cc(256))) test_st_i32x64x2_256(struct st_i32x64x2 arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x3(target("riscv.vector.tuple", , 3) %arg) @@ -188,7 +188,7 @@ void __attribute__((riscv_vls_cc)) test_st_i32x4x8(struct st_i32x4x8 arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x8_256(target("riscv.vector.tuple", , 8) %arg) void __attribute__((riscv_vls_cc(256))) test_st_i32x4x8_256(struct st_i32x4x8 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x9(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x9(ptr dead_on_return noundef %arg) void __attribute__((riscv_vls_cc)) test_st_i32x4x9(struct st_i32x4x9 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x9_256(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x9_256(ptr dead_on_return noundef %arg) void __attribute__((riscv_vls_cc(256))) test_st_i32x4x9_256(struct st_i32x4x9 arg) {} diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp index 5f6539796c20d..128610e578c26 100644 --- a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp +++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp @@ -118,7 +118,7 @@ struct st_i32x4x9 { typedef int __attribute__((vector_size(256))) int32x64_t; -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z14test_too_largeDv64_i(ptr noundef %0) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z14test_too_largeDv64_i(ptr dead_on_return noundef %0) [[riscv::vls_cc]] void test_too_large(int32x64_t arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z18test_too_large_256Dv64_i( noundef %arg.coerce) [[riscv::vls_cc(256)]] void test_too_large_256(int32x64_t arg) {} @@ -153,9 +153,9 @@ typedef int __attribute__((vector_size(256))) int32x64_t; // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x8x2_25610st_i32x8x2(target("riscv.vector.tuple", , 2) %arg) [[riscv::vls_cc(256)]] void test_st_i32x8x2_256(struct st_i32x8x2 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z16test_st_i32x64x211st_i32x64x2(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z16test_st_i32x64x211st_i32x64x2(ptr dead_on_return noundef %arg) [[riscv::vls_cc]] void test_st_i32x64x2(struct st_i32x64x2 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(256) void 
@_Z20test_st_i32x64x2_25611st_i32x64x2(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z20test_st_i32x64x2_25611st_i32x64x2(ptr dead_on_return noundef %arg) [[riscv::vls_cc(256)]] void test_st_i32x64x2_256(struct st_i32x64x2 arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x310st_i32x4x3(target("riscv.vector.tuple", , 3) %arg) @@ -168,7 +168,7 @@ typedef int __attribute__((vector_size(256))) int32x64_t; // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x8_25610st_i32x4x8(target("riscv.vector.tuple", , 8) %arg) [[riscv::vls_cc(256)]] void test_st_i32x4x8_256(struct st_i32x4x8 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x910st_i32x4x9(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x910st_i32x4x9(ptr dead_on_return noundef %arg) [[riscv::vls_cc]] void test_st_i32x4x9(struct st_i32x4x9 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x9_25610st_i32x4x9(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x9_25610st_i32x4x9(ptr dead_on_return noundef %arg) [[riscv::vls_cc(256)]] void test_st_i32x4x9_256(struct st_i32x4x9 arg) {} diff --git a/clang/test/CodeGen/RISCV/riscv32-abi.c b/clang/test/CodeGen/RISCV/riscv32-abi.c index b53f9a9169146..a9e56d40817ae 100644 --- a/clang/test/CodeGen/RISCV/riscv32-abi.c +++ b/clang/test/CodeGen/RISCV/riscv32-abi.c @@ -246,7 +246,7 @@ struct large { }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_agg_large -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[X:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[X:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_agg_large(struct large x) { @@ -266,7 +266,7 @@ struct large f_agg_large_ret(int32_t i, int8_t j) { typedef unsigned char v16i8 __attribute__((vector_size(16))); // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_vec_large_v16i8 -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[TMP0:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_vec_large_v16i8(v16i8 x) { @@ -285,7 +285,7 @@ v16i8 f_vec_large_v16i8_ret(void) { // if they were passed in registers. // ILP32-ILP32F-ILP32D-LABEL: define dso_local i32 @f_scalar_stack_1 -// ILP32-ILP32F-ILP32D-SAME: (i32 [[A_COERCE:%.*]], [2 x i32] [[B_COERCE:%.*]], i64 [[C_COERCE:%.*]], ptr noundef [[D:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]], i8 noundef signext [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (i32 [[A_COERCE:%.*]], [2 x i32] [[B_COERCE:%.*]], i64 [[C_COERCE:%.*]], ptr dead_on_return noundef [[D:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]], i8 noundef signext [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c, @@ -343,7 +343,7 @@ struct large f_scalar_stack_6(float a, int64_t b, double c, long double d, // they would be if passed via registers. 
// ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_agg_stack -// ILP32-ILP32F-ILP32D-SAME: (double noundef [[A:%.*]], i64 noundef [[B:%.*]], double noundef [[C:%.*]], i64 noundef [[D:%.*]], i32 [[E_COERCE:%.*]], [2 x i32] [[F_COERCE:%.*]], i64 [[G_COERCE:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (double noundef [[A:%.*]], i64 noundef [[B:%.*]], double noundef [[C:%.*]], i64 noundef [[D:%.*]], i32 [[E_COERCE:%.*]], [2 x i32] [[F_COERCE:%.*]], i64 [[G_COERCE:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_agg_stack(double a, int64_t b, double c, int64_t d, struct tiny e, @@ -366,7 +366,7 @@ struct double_int8_s { double d; int64_t i; }; struct int_double_s { int a; double b; }; // ILP32-ILP32F-LABEL: define dso_local void @f_int_double_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_int_double_s_arg @@ -482,7 +482,7 @@ struct zbf_double_zbf_s f_ret_zbf_double_zbf_s(void) { struct double_float_s { double f; float g; }; // ILP32-ILP32F-LABEL: define dso_local void @f_double_double_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_double_s_arg @@ -504,7 +504,7 @@ struct double_double_s f_ret_double_double_s(void) { } // ILP32-ILP32F-LABEL: define dso_local void @f_double_float_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_float_s_arg @@ -526,7 +526,7 @@ struct double_float_s f_ret_double_float_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_double_double_s_arg_insufficient_fprs -// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_double_s_arg_insufficient_fprs(float a, double b, double c, double d, @@ -543,7 +543,7 @@ struct double_int64bf_s { double d; int64_t i : 32; }; struct double_int8_zbf_s { double d; int8_t i; int : 0; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_double_int8_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_int8_s_arg(struct double_int8_s a) {} @@ -557,7 +557,7 @@ struct double_int8_s f_ret_double_int8_s(void) { } // ILP32-ILP32F-LABEL: define dso_local void @f_double_uint8_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_uint8_s_arg @@ -579,7 +579,7 @@ struct double_uint8_s f_ret_double_uint8_s(void) { } // ILP32-ILP32F-LABEL: define dso_local void 
@f_double_int32_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_int32_s_arg @@ -601,7 +601,7 @@ struct double_int32_s f_ret_double_int32_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_double_int64_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_int64_s_arg(struct double_int64_s a) {} @@ -615,7 +615,7 @@ struct double_int64_s f_ret_double_int64_s(void) { } // ILP32-ILP32F-LABEL: define dso_local void @f_double_int64bf_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_int64bf_s_arg @@ -640,7 +640,7 @@ struct double_int64bf_s f_ret_double_int64bf_s(void) { // floating point calling convention. // ILP32-ILP32F-LABEL: define dso_local void @f_double_int8_zbf_s -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_int8_zbf_s @@ -662,14 +662,14 @@ struct double_int8_zbf_s f_ret_double_int8_zbf_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_double_int8_s_arg_insufficient_gprs -// ILP32-ILP32F-ILP32D-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], i32 noundef [[H:%.*]], ptr noundef [[I:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], i32 noundef [[H:%.*]], ptr dead_on_return noundef [[I:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_int8_s_arg_insufficient_gprs(int a, int b, int c, int d, int e, int f, int g, int h, struct double_int8_s i) {} // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_struct_double_int8_insufficient_fprs -// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], double noundef [[H:%.*]], ptr noundef [[I:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], double noundef [[H:%.*]], ptr dead_on_return noundef [[I:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_struct_double_int8_insufficient_fprs(float a, double b, double c, double d, @@ -679,7 +679,7 @@ void f_struct_double_int8_insufficient_fprs(float a, double b, double c, double // floating-point value should be passed as if it were an fp+fp struct. 
// ILP32-ILP32F-LABEL: define dso_local void @f_doublecomplex -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublecomplex @@ -703,7 +703,7 @@ double __complex__ f_ret_doublecomplex(void) { struct doublecomplex_s { double __complex__ c; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublecomplex_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublecomplex_s_arg @@ -754,7 +754,7 @@ struct doublearr1_s f_ret_doublearr1_s(void) { struct doublearr2_s { double a[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_s_arg @@ -778,7 +778,7 @@ struct doublearr2_s f_ret_doublearr2_s(void) { struct doublearr2_tricky1_s { struct { double f[1]; } g[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_tricky1_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_tricky1_s_arg @@ -802,7 +802,7 @@ struct doublearr2_tricky1_s f_ret_doublearr2_tricky1_s(void) { struct doublearr2_tricky2_s { struct {}; struct { double f[1]; } g[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_tricky2_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_tricky2_s_arg @@ -826,7 +826,7 @@ struct doublearr2_tricky2_s f_ret_doublearr2_tricky2_s(void) { struct doublearr2_tricky3_s { union {}; struct { double f[1]; } g[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_tricky3_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_tricky3_s_arg @@ -850,7 +850,7 @@ struct doublearr2_tricky3_s f_ret_doublearr2_tricky3_s(void) { struct doublearr2_tricky4_s { union {}; struct { struct {}; double f[1]; } g[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_tricky4_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_tricky4_s_arg @@ -877,7 +877,7 @@ struct doublearr2_tricky4_s f_ret_doublearr2_tricky4_s(void) { struct int_double_int_s { int a; double b; int c; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int_double_int_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int_double_int_s_arg(struct int_double_int_s a) {} @@ -893,7 +893,7 @@ struct int_double_int_s f_ret_int_double_int_s(void) { struct int64_double_s { int64_t a; double b; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int64_double_s_arg -// 
ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int64_double_s_arg(struct int64_double_s a) {} @@ -909,7 +909,7 @@ struct int64_double_s f_ret_int64_double_s(void) { struct char_char_double_s { char a; char b; double c; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_char_char_double_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_char_char_double_s_arg(struct char_char_double_s a) {} @@ -948,7 +948,7 @@ union double_u f_ret_double_u(void) { // double+double structs by the ABI. // ILP32-ILP32F-LABEL: define dso_local void @f_ret_double_int32_s_double_int32_s_just_sufficient_gprs -// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_DOUBLE_INT32_S:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_DOUBLE_INT32_S:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local { double, i32 } @f_ret_double_int32_s_double_int32_s_just_sufficient_gprs @@ -961,7 +961,7 @@ struct double_int32_s f_ret_double_int32_s_double_int32_s_just_sufficient_gprs( } // ILP32-ILP32F-LABEL: define dso_local void @f_ret_double_double_s_double_int32_s_just_sufficient_gprs -// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_DOUBLE_DOUBLE_S:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_DOUBLE_DOUBLE_S:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local { double, double } @f_ret_double_double_s_double_int32_s_just_sufficient_gprs @@ -974,7 +974,7 @@ struct double_double_s f_ret_double_double_s_double_int32_s_just_sufficient_gprs } // ILP32-ILP32F-LABEL: define dso_local void @f_ret_doublecomplex_double_int32_s_just_sufficient_gprs -// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // 
ILP32D-LABEL: define dso_local { double, double } @f_ret_doublecomplex_double_int32_s_just_sufficient_gprs @@ -1189,7 +1189,7 @@ struct float_int32_s f_ret_float_int32_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_float_int64_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_float_int64_s_arg(struct float_int64_s a) {} @@ -1465,7 +1465,7 @@ struct floatarr2_tricky4_s f_ret_floatarr2_tricky4_s(void) { struct int_float_int_s { int a; float b; int c; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int_float_int_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int_float_int_s_arg(struct int_float_int_s a) {} @@ -1481,7 +1481,7 @@ struct int_float_int_s f_ret_int_float_int_s(void) { struct int64_float_s { int64_t a; float b; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int64_float_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int64_float_s_arg(struct int64_float_s a) {} @@ -1619,7 +1619,7 @@ struct zbf_float16_zbf_s f_ret_zbf_float16_zbf_s(void) { struct double_float16_s { double f; _Float16 g; }; // ILP32-ILP32F-LABEL: define dso_local void @f_double_float16_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_float16_s_arg @@ -1641,7 +1641,7 @@ struct double_float16_s f_ret_double_float16_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_double_float16_s_arg_insufficient_fprs -// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_float16_s_arg_insufficient_fprs(float a, double b, double c, double d, @@ -1725,7 +1725,7 @@ struct float16_int32_s f_ret_float16_int32_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_float16_int64_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_float16_int64_s_arg(struct float16_int64_s a) {} @@ -2001,7 +2001,7 @@ struct float16arr2_tricky4_s f_ret_float16arr2_tricky4_s(void) { struct int_float16_int_s { int a; _Float16 b; int c; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int_float16_int_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int_float16_int_s_arg(struct int_float16_int_s a) {} @@ -2017,7 +2017,7 @@ struct int_float16_int_s f_ret_int_float16_int_s(void) { struct int64_float16_s { 
int64_t a; _Float16 b; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int64_float16_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int64_float16_s_arg(struct int64_float16_s a) {} diff --git a/clang/test/CodeGen/RISCV/riscv32-vararg.c b/clang/test/CodeGen/RISCV/riscv32-vararg.c index 2b332410f8637..ed301f9269bb8 100644 --- a/clang/test/CodeGen/RISCV/riscv32-vararg.c +++ b/clang/test/CodeGen/RISCV/riscv32-vararg.c @@ -64,7 +64,7 @@ int f_va_callee(int, ...); // CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SMALL_ALIGNED]], ptr [[DOTCOMPOUNDLITERAL4]], i32 0, i32 0 // CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[COERCE_DIVE]], align 8 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[BYVAL_TEMP]], ptr align 4 [[DOTCOMPOUNDLITERAL6]], i32 16, i1 false) -// CHECK-NEXT: [[CALL:%.*]] = call i32 (i32, ...) @f_va_callee(i32 noundef 1, i32 noundef 2, i64 noundef 3, double noundef 4.000000e+00, double noundef 5.000000e+00, i32 [[TMP0]], [2 x i32] [[TMP1]], i64 [[TMP2]], ptr noundef [[BYVAL_TEMP]]) +// CHECK-NEXT: [[CALL:%.*]] = call i32 (i32, ...) @f_va_callee(i32 noundef 1, i32 noundef 2, i64 noundef 3, double noundef 4.000000e+00, double noundef 5.000000e+00, i32 [[TMP0]], [2 x i32] [[TMP1]], i64 [[TMP2]], ptr dead_on_return noundef [[BYVAL_TEMP]]) // CHECK-NEXT: ret void // void f_va_caller(void) { diff --git a/clang/test/CodeGen/RISCV/riscv64-abi.c b/clang/test/CodeGen/RISCV/riscv64-abi.c index 021565238904e..dc01750e56970 100644 --- a/clang/test/CodeGen/RISCV/riscv64-abi.c +++ b/clang/test/CodeGen/RISCV/riscv64-abi.c @@ -242,7 +242,7 @@ struct large { }; // LP64-LP64F-LP64D-LABEL: define dso_local void @f_agg_large -// LP64-LP64F-LP64D-SAME: (ptr noundef [[X:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_return noundef [[X:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // void f_agg_large(struct large x) { @@ -262,7 +262,7 @@ struct large f_agg_large_ret(int32_t i, int8_t j) { typedef unsigned char v32i8 __attribute__((vector_size(32))); // LP64-LP64F-LP64D-LABEL: define dso_local void @f_vec_large_v32i8 -// LP64-LP64F-LP64D-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_return noundef [[TMP0:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // void f_vec_large_v32i8(v32i8 x) { @@ -281,7 +281,7 @@ v32i8 f_vec_large_v32i8_ret(void) { // if they were passed in registers. 
// LP64-LP64F-LP64D-LABEL: define dso_local signext i32 @f_scalar_stack_1 -// LP64-LP64F-LP64D-SAME: (i64 [[A_COERCE:%.*]], [2 x i64] [[B_COERCE:%.*]], i128 [[C_COERCE:%.*]], ptr noundef [[D:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]], i8 noundef signext [[H:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (i64 [[A_COERCE:%.*]], [2 x i64] [[B_COERCE:%.*]], i128 [[C_COERCE:%.*]], ptr dead_on_return noundef [[D:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]], i8 noundef signext [[H:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c, @@ -290,7 +290,7 @@ int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c, } // LP64-LP64F-LP64D-LABEL: define dso_local signext i32 @f_scalar_stack_2 -// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], i64 noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], i64 noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // int f_scalar_stack_2(int32_t a, __int128_t b, int64_t c, long double d, v32i8 e, @@ -299,7 +299,7 @@ int f_scalar_stack_2(int32_t a, __int128_t b, int64_t c, long double d, v32i8 e, } // LP64-LP64F-LP64D-LABEL: define dso_local signext i32 @f_scalar_stack_3 -// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], double noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], double noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // int f_scalar_stack_3(int32_t a, __int128_t b, double c, long double d, v32i8 e, @@ -312,7 +312,7 @@ int f_scalar_stack_3(int32_t a, __int128_t b, double c, long double d, v32i8 e, // to pass a pointer. 
// LP64-LP64F-LP64D-LABEL: define dso_local void @f_scalar_stack_4 -// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_LARGE:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], fp128 noundef [[C:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_LARGE:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], fp128 noundef [[C:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // struct large f_scalar_stack_4(uint32_t a, __int128_t b, long double c, v32i8 d, @@ -321,7 +321,7 @@ struct large f_scalar_stack_4(uint32_t a, __int128_t b, long double c, v32i8 d, } // LP64-LP64F-LP64D-LABEL: define dso_local void @f_scalar_stack_5 -// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_LARGE:%.*]]) align 8 [[AGG_RESULT:%.*]], double noundef [[A:%.*]], i128 noundef [[B:%.*]], fp128 noundef [[C:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_LARGE:%.*]]) align 8 [[AGG_RESULT:%.*]], double noundef [[A:%.*]], i128 noundef [[B:%.*]], fp128 noundef [[C:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // struct large f_scalar_stack_5(double a, __int128_t b, long double c, v32i8 d, @@ -330,7 +330,7 @@ struct large f_scalar_stack_5(double a, __int128_t b, long double c, v32i8 d, } // LP64-LP64F-LP64D-LABEL: define dso_local signext i32 @f_scalar_stack_6 -// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], float noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]], half noundef [[I:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], float noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]], half noundef [[I:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // int f_scalar_stack_6(int32_t a, __int128_t b, float c, long double d, v32i8 e, @@ -1440,7 +1440,7 @@ struct doublearr2_tricky4_s f_ret_doublearr2_tricky4_s(void) { struct int_double_int_s { int a; double b; int c; }; // LP64-LP64F-LP64D-LABEL: define dso_local void @f_int_double_int_s_arg -// LP64-LP64F-LP64D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // void f_int_double_int_s_arg(struct int_double_int_s a) {} diff --git a/clang/test/CodeGen/RISCV/riscv64-vararg.c b/clang/test/CodeGen/RISCV/riscv64-vararg.c index a278f74ca4a86..17802553c795a 100644 --- a/clang/test/CodeGen/RISCV/riscv64-vararg.c +++ b/clang/test/CodeGen/RISCV/riscv64-vararg.c @@ -74,7 +74,7 @@ int f_va_callee(int, ...); // CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SMALL_ALIGNED]], ptr [[DOTCOMPOUNDLITERAL4]], i32 0, i32 0 // CHECK-NEXT: [[TMP2:%.*]] = load 
i128, ptr [[COERCE_DIVE]], align 16 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[BYVAL_TEMP]], ptr align 8 [[DOTCOMPOUNDLITERAL6]], i64 32, i1 false) -// CHECK-NEXT: [[CALL:%.*]] = call signext i32 (i32, ...) @f_va_callee(i32 noundef signext 1, i32 noundef signext 2, i64 noundef 3, double noundef 4.000000e+00, double noundef 5.000000e+00, i64 [[TMP0]], [2 x i64] [[TMP1]], i128 [[TMP2]], ptr noundef [[BYVAL_TEMP]]) +// CHECK-NEXT: [[CALL:%.*]] = call signext i32 (i32, ...) @f_va_callee(i32 noundef signext 1, i32 noundef signext 2, i64 noundef 3, double noundef 4.000000e+00, double noundef 5.000000e+00, i64 [[TMP0]], [2 x i64] [[TMP1]], i128 [[TMP2]], ptr dead_on_return noundef [[BYVAL_TEMP]]) // CHECK-NEXT: [[CALL11:%.*]] = call signext i32 (i32, ...) @f_va_callee(i32 noundef signext 1, i32 noundef signext 2, i32 noundef signext 3, i32 noundef signext 4, fp128 noundef 0xL00000000000000004001400000000000, i32 noundef signext 6, i32 noundef signext 7, i32 noundef signext 8, i32 noundef signext 9) // CHECK-NEXT: [[A13:%.*]] = getelementptr inbounds nuw [[STRUCT_SMALL_ALIGNED]], ptr [[DOTCOMPOUNDLITERAL12]], i32 0, i32 0 // CHECK-NEXT: store i128 5, ptr [[A13]], align 16 diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vcompress.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vcompress.c new file mode 100644 index 0000000000000..6cff3dfbbb7dd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vcompress.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vcompress_vm_bf16mf4(vbfloat16mf4_t vs2, vbool64_t vs1, + size_t vl) { + return __riscv_vcompress_vm_bf16mf4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vcompress_vm_bf16mf2(vbfloat16mf2_t vs2, vbool32_t vs1, + size_t vl) { + return __riscv_vcompress_vm_bf16mf2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vcompress_vm_bf16m1(vbfloat16m1_t vs2, vbool16_t vs1, + size_t vl) { + return __riscv_vcompress_vm_bf16m1(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vcompress_vm_bf16m2(vbfloat16m2_t vs2, vbool8_t vs1, + size_t vl) { + return __riscv_vcompress_vm_bf16m2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vcompress_vm_bf16m4(vbfloat16m4_t vs2, vbool4_t vs1, + size_t vl) { + return __riscv_vcompress_vm_bf16m4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv32bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vcompress_vm_bf16m8(vbfloat16m8_t vs2, vbool2_t vs1, + size_t vl) { + return __riscv_vcompress_vm_bf16m8(vs2, vs1, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vrgather.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vrgather.c new file mode 100644 index 0000000000000..cb0004fa2b64d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/non-overloaded/vrgather.c @@ -0,0 +1,272 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4(vbfloat16mf4_t vs2, vuint16mf4_t vs1, + size_t vl) { + return __riscv_vrgather_vv_bf16mf4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv1bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4(vbfloat16mf4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_vx_bf16mf4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2(vbfloat16mf2_t vs2, vuint16mf2_t vs1, + size_t vl) { + return __riscv_vrgather_vv_bf16mf2(vs2, 
vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv2bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2(vbfloat16mf2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_vx_bf16mf2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1(vbfloat16m1_t vs2, vuint16m1_t vs1, + size_t vl) { + return __riscv_vrgather_vv_bf16m1(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv4bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1(vbfloat16m1_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_vx_bf16m1(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2(vbfloat16m2_t vs2, vuint16m2_t vs1, + size_t vl) { + return __riscv_vrgather_vv_bf16m2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv8bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2(vbfloat16m2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_vx_bf16m2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4(vbfloat16m4_t vs2, vuint16m4_t vs1, + size_t vl) { + return __riscv_vrgather_vv_bf16m4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv16bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4(vbfloat16m4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_vx_bf16m4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv32bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8(vbfloat16m8_t vs2, vuint16m8_t vs1, + size_t vl) { + return __riscv_vrgather_vv_bf16m8(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv32bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8(vbfloat16m8_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_vx_bf16m8(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + vuint16mf4_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf4_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv1bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf4_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + vuint16mf2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv2bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + vuint16m1_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m1_m(vm, vs2, vs1, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv4bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m1_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + vuint16m2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv8bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + vuint16m4_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m4_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv16bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m4_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv32bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + vuint16m8_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m8_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv32bf16.i64( poison, [[VS2]], i64 [[VS1]], 
[[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m8_m(vm, vs2, vs1, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vcompress.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vcompress.c new file mode 100644 index 0000000000000..40de6fdccf95f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vcompress.c @@ -0,0 +1,74 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vcompress_vm_bf16mf4(vbfloat16mf4_t vs2, vbool64_t vs1, + size_t vl) { + return __riscv_vcompress(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vcompress_vm_bf16mf2(vbfloat16mf2_t vs2, vbool32_t vs1, + size_t vl) { + return __riscv_vcompress(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vcompress_vm_bf16m1(vbfloat16m1_t vs2, vbool16_t vs1, + size_t vl) { + return __riscv_vcompress(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vcompress_vm_bf16m2(vbfloat16m2_t vs2, vbool8_t vs1, + size_t vl) { + return __riscv_vcompress(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vcompress_vm_bf16m4(vbfloat16m4_t vs2, vbool4_t vs1, + size_t vl) { + return __riscv_vcompress(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv32bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vcompress_vm_bf16m8(vbfloat16m8_t vs2, vbool2_t vs1, + size_t vl) { + return __riscv_vcompress(vs2, vs1, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vrgather.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vrgather.c new file mode 100644 index 0000000000000..068d8498997b6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/non-policy/overloaded/vrgather.c @@ -0,0 +1,272 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4(vbfloat16mf4_t vs2, vuint16mf4_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv1bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4(vbfloat16mf4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2(vbfloat16mf2_t vs2, vuint16mf2_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv2bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2(vbfloat16mf2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1(vbfloat16m1_t vs2, vuint16m1_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv4bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1(vbfloat16m1_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2(vbfloat16m2_t vs2, vuint16m2_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv8bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2(vbfloat16m2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4(vbfloat16m4_t vs2, vuint16m4_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv16bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4(vbfloat16m4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv32bf16.i64( poison, [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8(vbfloat16m8_t vs2, vuint16m8_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv32bf16.i64( poison, [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8(vbfloat16m8_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + vuint16mf4_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv1bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + vuint16mf2_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv2bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + vuint16m1_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv4bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + vuint16m2_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv8bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + vuint16m4_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv16bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv32bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + vuint16m8_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv32bf16.i64( poison, [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather(vm, vs2, vs1, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vcompress.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vcompress.c new file mode 100644 index 0000000000000..90160c8fe19c3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vcompress.c @@ -0,0 +1,68 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16mf4_t test_vcompress_vm_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, vbool64_t vs1, size_t vl) { + return __riscv_vcompress_vm_bf16mf4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vcompress_vm_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, vbool32_t vs1, size_t vl) { + return __riscv_vcompress_vm_bf16mf2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vcompress_vm_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, vbool16_t vs1, size_t vl) { + return __riscv_vcompress_vm_bf16m1_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vcompress_vm_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, vbool8_t vs1, size_t vl) { + return __riscv_vcompress_vm_bf16m2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vcompress_vm_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, vbool4_t vs1, size_t vl) { + return __riscv_vcompress_vm_bf16m4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vcompress_vm_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, vbool2_t vs1, size_t vl) { + return __riscv_vcompress_vm_bf16m8_tu(vd, vs2, vs1, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vrgather.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vrgather.c new file mode 100644 index 0000000000000..137ab17c190b9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/non-overloaded/vrgather.c @@ -0,0 +1,488 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \ +// RUN: -target-feature +zvfbfmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include 
+ +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, vuint16mf4_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, vuint16mf2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, vuint16m1_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m1_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m1_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t 
test_vrgather_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, vuint16m2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, vuint16m4_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, vuint16m8_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m8_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m8_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, vuint16mf4_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf4_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf4_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, vuint16mf2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, vuint16m1_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m1_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m1_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, vuint16m2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, vuint16m4_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m4_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m4_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, vuint16m8_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m8_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m8_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, vuint16mf4_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf4_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv1bf16.i64( [[VD]], 
[[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf4_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, vuint16mf2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, vuint16m1_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m1_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m1_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, vuint16m2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, vuint16m4_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m4_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m4_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, vuint16m8_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m8_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m8_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, vuint16mf4_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf4_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t 
test_vrgather_vx_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf4_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, vuint16mf2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16mf2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16mf2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, vuint16m1_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m1_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vs1, size_t vl) { + return __riscv_vrgather_vx_bf16m1_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, vuint16m2_t vs1, size_t vl) { + return __riscv_vrgather_vv_bf16m2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vs1, 
size_t vl) {
+  return __riscv_vrgather_vx_bf16m2_mu(vm, vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_mu(
+// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m4_t test_vrgather_vv_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, vuint16m4_t vs1, size_t vl) {
+  return __riscv_vrgather_vv_bf16m4_mu(vm, vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_mu(
+// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m4_t test_vrgather_vx_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vs1, size_t vl) {
+  return __riscv_vrgather_vx_bf16m4_mu(vm, vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_mu(
+// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vrgather_vv_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, vuint16m8_t vs1, size_t vl) {
+  return __riscv_vrgather_vv_bf16m8_mu(vm, vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_mu(
+// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vrgather_vx_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vs1, size_t vl) {
+  return __riscv_vrgather_vx_bf16m8_mu(vm, vd, vs2, vs1, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vcompress.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vcompress.c
new file mode 100644
index 0000000000000..079977a0a06b0
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vcompress.c
@@ -0,0 +1,76 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \
+// RUN:   -target-feature +zvfbfmin -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16mf4_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vcompress_vm_bf16mf4_tu(vbfloat16mf4_t vd,
+                                            vbfloat16mf4_t vs2, vbool64_t vs1,
+                                            size_t vl) {
+  return __riscv_vcompress_tu(vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16mf2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vcompress_vm_bf16mf2_tu(vbfloat16mf2_t vd,
+                                            vbfloat16mf2_t vs2, vbool32_t vs1,
+                                            size_t vl) {
+  return __riscv_vcompress_tu(vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m1_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m1_t test_vcompress_vm_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2,
+                                          vbool16_t vs1, size_t vl) {
+  return __riscv_vcompress_tu(vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m2_t test_vcompress_vm_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2,
+                                          vbool8_t vs1, size_t vl) {
+  return __riscv_vcompress_tu(vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m4_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m4_t test_vcompress_vm_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2,
+                                          vbool4_t vs1, size_t vl) {
+  return __riscv_vcompress_tu(vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vcompress_vm_bf16m8_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: entry:
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vcompress.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vcompress_vm_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2,
+                                          vbool2_t vs1, size_t vl) {
+  return __riscv_vcompress_tu(vd, vs2, vs1, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vrgather.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vrgather.c
new file mode 100644
index 0000000000000..7a5624aed608b
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfmin/policy/overloaded/vrgather.c
@@ -0,0 +1,576 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x \
+// RUN:   -target-feature +zvfbfmin -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]],
i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_tu(vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, vuint16mf4_t vs1, + size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4_tu(vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_tu(vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, vuint16mf2_t vs1, + size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_tu(vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, + vuint16m1_t vs1, size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, + vuint16m2_t vs1, size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vrgather_vx_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, + vuint16m4_t vs1, size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, + vuint16m8_t vs1, size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, + size_t vs1, size_t vl) { + return __riscv_vrgather_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, + vuint16mf4_t vs1, size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t 
test_vrgather_vx_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, + vuint16mf2_t vs1, size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, vuint16m1_t vs1, + size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, vuint16m2_t vs1, + size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vs1, + size_t vl) { + 
return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, vuint16m4_t vs1, + size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, vuint16m8_t vs1, + size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, + vuint16mf4_t vs1, size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vrgather_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, + vuint16mf2_t vs1, size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, vuint16m1_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, vuint16m2_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], 
[[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, vuint16m4_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, vuint16m8_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vv_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, vuint16mf4_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vrgather_vx_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vv_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, vuint16mf2_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vrgather_vx_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vv_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, vuint16m1_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vrgather_vx_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vv_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, vuint16m2_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vrgather_vx_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vv_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, vuint16m4_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vrgather_vx_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vv_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, vuint16m8_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vrgather_vx_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vrgather.vx.mask.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VS1]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vrgather_vx_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vs1, + size_t vl) { + return __riscv_vrgather_mu(vm, vd, vs2, vs1, vl); +} diff --git a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c index e5704709a3a33..fab6050a0d876 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c @@ -58,91 +58,91 @@ unsigned int align = __alignof__ (v16i8); // CHECK-VECTOR: @align ={{.*}} global i32 8 v1i8 pass_v1i8(v1i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i8(ptr dead_on_unwind noalias writable sret(<1 x i8>) align 1 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i8(ptr dead_on_unwind noalias writable sret(<1 x i8>) align 1 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i8> @pass_v1i8(<1 x i8> %{{.*}}) v2i8 pass_v2i8(v2i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2i8(ptr dead_on_unwind noalias writable sret(<2 x i8>) align 2 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2i8(ptr dead_on_unwind noalias writable sret(<2 x i8>) align 2 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x i8> @pass_v2i8(<2 x i8> %{{.*}}) v4i8 pass_v4i8(v4i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v4i8(ptr dead_on_unwind noalias writable sret(<4 x i8>) align 4 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v4i8(ptr dead_on_unwind noalias writable sret(<4 x i8>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x i8> @pass_v4i8(<4 x i8> %{{.*}}) v8i8 pass_v8i8(v8i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v8i8(ptr dead_on_unwind noalias writable sret(<8 x i8>) 
align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v8i8(ptr dead_on_unwind noalias writable sret(<8 x i8>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <8 x i8> @pass_v8i8(<8 x i8> %{{.*}}) v16i8 pass_v16i8(v16i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v16i8(ptr dead_on_unwind noalias writable sret(<16 x i8>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v16i8(ptr dead_on_unwind noalias writable sret(<16 x i8>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <16 x i8> @pass_v16i8(<16 x i8> %{{.*}}) v32i8 pass_v32i8(v32i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v32i8(ptr dead_on_unwind noalias writable sret(<32 x i8>) align 32 %{{.*}}, ptr %0) -// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v32i8(ptr dead_on_unwind noalias writable sret(<32 x i8>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v32i8(ptr dead_on_unwind noalias writable sret(<32 x i8>) align 32 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v32i8(ptr dead_on_unwind noalias writable sret(<32 x i8>) align 8 %{{.*}}, ptr dead_on_return %0) v1i16 pass_v1i16(v1i16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i16(ptr dead_on_unwind noalias writable sret(<1 x i16>) align 2 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i16(ptr dead_on_unwind noalias writable sret(<1 x i16>) align 2 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i16> @pass_v1i16(<1 x i16> %{{.*}}) v2i16 pass_v2i16(v2i16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2i16(ptr dead_on_unwind noalias writable sret(<2 x i16>) align 4 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2i16(ptr dead_on_unwind noalias writable sret(<2 x i16>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x i16> @pass_v2i16(<2 x i16> %{{.*}}) v4i16 pass_v4i16(v4i16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v4i16(ptr dead_on_unwind noalias writable sret(<4 x i16>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v4i16(ptr dead_on_unwind noalias writable sret(<4 x i16>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x i16> @pass_v4i16(<4 x i16> %{{.*}}) v8i16 pass_v8i16(v8i16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v8i16(ptr dead_on_unwind noalias writable sret(<8 x i16>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v8i16(ptr dead_on_unwind noalias writable sret(<8 x i16>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <8 x i16> @pass_v8i16(<8 x i16> %{{.*}}) v1i32 pass_v1i32(v1i32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i32(ptr dead_on_unwind noalias writable sret(<1 x i32>) align 4 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i32(ptr dead_on_unwind noalias writable sret(<1 x i32>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i32> @pass_v1i32(<1 x i32> %{{.*}}) v2i32 pass_v2i32(v2i32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2i32(ptr dead_on_unwind noalias writable sret(<2 x i32>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2i32(ptr dead_on_unwind noalias writable sret(<2 x i32>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x i32> @pass_v2i32(<2 x i32> %{{.*}}) v4i32 pass_v4i32(v4i32 arg) { return 
arg; } -// CHECK-LABEL: define{{.*}} void @pass_v4i32(ptr dead_on_unwind noalias writable sret(<4 x i32>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v4i32(ptr dead_on_unwind noalias writable sret(<4 x i32>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x i32> @pass_v4i32(<4 x i32> %{{.*}}) v1i64 pass_v1i64(v1i64 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i64(ptr dead_on_unwind noalias writable sret(<1 x i64>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i64(ptr dead_on_unwind noalias writable sret(<1 x i64>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i64> @pass_v1i64(<1 x i64> %{{.*}}) v2i64 pass_v2i64(v2i64 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2i64(ptr dead_on_unwind noalias writable sret(<2 x i64>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2i64(ptr dead_on_unwind noalias writable sret(<2 x i64>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x i64> @pass_v2i64(<2 x i64> %{{.*}}) v1i128 pass_v1i128(v1i128 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i128(ptr dead_on_unwind noalias writable sret(<1 x i128>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i128(ptr dead_on_unwind noalias writable sret(<1 x i128>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i128> @pass_v1i128(<1 x i128> %{{.*}}) v1f32 pass_v1f32(v1f32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1f32(ptr dead_on_unwind noalias writable sret(<1 x float>) align 4 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1f32(ptr dead_on_unwind noalias writable sret(<1 x float>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x float> @pass_v1f32(<1 x float> %{{.*}}) v2f32 pass_v2f32(v2f32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2f32(ptr dead_on_unwind noalias writable sret(<2 x float>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2f32(ptr dead_on_unwind noalias writable sret(<2 x float>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x float> @pass_v2f32(<2 x float> %{{.*}}) v4f32 pass_v4f32(v4f32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v4f32(ptr dead_on_unwind noalias writable sret(<4 x float>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v4f32(ptr dead_on_unwind noalias writable sret(<4 x float>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x float> @pass_v4f32(<4 x float> %{{.*}}) v1f64 pass_v1f64(v1f64 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1f64(ptr dead_on_unwind noalias writable sret(<1 x double>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1f64(ptr dead_on_unwind noalias writable sret(<1 x double>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x double> @pass_v1f64(<1 x double> %{{.*}}) v2f64 pass_v2f64(v2f64 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2f64(ptr dead_on_unwind noalias writable sret(<2 x double>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2f64(ptr dead_on_unwind noalias writable sret(<2 x double>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x double> @pass_v2f64(<2 x double> %{{.*}}) v1f128 pass_v1f128(v1f128 arg) { return arg; } -// CHECK-LABEL: 
define{{.*}} void @pass_v1f128(ptr dead_on_unwind noalias writable sret(<1 x fp128>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1f128(ptr dead_on_unwind noalias writable sret(<1 x fp128>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x fp128> @pass_v1f128(<1 x fp128> %{{.*}}) @@ -170,13 +170,13 @@ struct agg_v8i8 pass_agg_v8i8(struct agg_v8i8 arg) { return arg; } struct agg_v16i8 { v16i8 a; }; struct agg_v16i8 pass_agg_v16i8(struct agg_v16i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_v16i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v16i8) align 16 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_v16i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v16i8) align 16 %{{.*}}, ptr dead_on_return %{{.*}}) // CHECK-VECTOR-LABEL: define{{.*}} void @pass_agg_v16i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v16i8) align 8 %{{.*}}, <16 x i8> %{{.*}}) struct agg_v32i8 { v32i8 a; }; struct agg_v32i8 pass_agg_v32i8(struct agg_v32i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_v32i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v32i8) align 32 %{{.*}}, ptr %{{.*}}) -// CHECK-VECTOR-LABEL: define{{.*}} void @pass_agg_v32i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v32i8) align 8 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_v32i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v32i8) align 32 %{{.*}}, ptr dead_on_return %{{.*}}) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_agg_v32i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v32i8) align 8 %{{.*}}, ptr dead_on_return %{{.*}}) // Verify that the following are *not* vector-like aggregate types diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c index f26084ab44eae..83137ae6d5f82 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi.c @@ -50,7 +50,7 @@ long long pass_longlong(long long arg) { return arg; } // CHECK-LABEL: define{{.*}} i64 @pass_longlong(i64 %{{.*}}) __int128 pass_int128(__int128 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr dead_on_return %0) _Float16 pass__Float16(_Float16 arg) { return arg; } // CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}}) @@ -62,37 +62,37 @@ double pass_double(double arg) { return arg; } // CHECK-LABEL: define{{.*}} double @pass_double(double %{{.*}}) long double pass_longdouble(long double arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_longdouble(ptr dead_on_unwind noalias writable sret(fp128) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_longdouble(ptr dead_on_unwind noalias writable sret(fp128) align 8 %{{.*}}, ptr dead_on_return %0) // Complex types _Complex char pass_complex_char(_Complex char arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_char(ptr dead_on_unwind noalias writable sret({ i8, i8 }) align 1 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_char(ptr dead_on_unwind noalias writable sret({ i8, i8 }) align 1 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex short pass_complex_short(_Complex short arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_short(ptr dead_on_unwind noalias writable sret({ i16, 
i16 }) align 2 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_short(ptr dead_on_unwind noalias writable sret({ i16, i16 }) align 2 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex int pass_complex_int(_Complex int arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_int(ptr dead_on_unwind noalias writable sret({ i32, i32 }) align 4 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_int(ptr dead_on_unwind noalias writable sret({ i32, i32 }) align 4 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex long pass_complex_long(_Complex long arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_long(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_long(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex long long pass_complex_longlong(_Complex long long arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex float pass_complex_float(_Complex float arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex double pass_complex_double(_Complex double arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_double(ptr dead_on_unwind noalias writable sret({ double, double }) align 8 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_double(ptr dead_on_unwind noalias writable sret({ double, double }) align 8 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex long double pass_complex_longdouble(_Complex long double arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_longdouble(ptr dead_on_unwind noalias writable sret({ fp128, fp128 }) align 8 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_longdouble(ptr dead_on_unwind noalias writable sret({ fp128, fp128 }) align 8 %{{.*}}, ptr dead_on_return %{{.*}}arg) // Aggregate types @@ -107,7 +107,7 @@ struct agg_2byte pass_agg_2byte(struct agg_2byte arg) { return arg; } struct agg_3byte { char a[3]; }; struct agg_3byte pass_agg_3byte(struct agg_3byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_3byte(ptr dead_on_unwind noalias writable sret(%struct.agg_3byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_3byte(ptr dead_on_unwind noalias writable sret(%struct.agg_3byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_4byte { char a[4]; }; struct agg_4byte pass_agg_4byte(struct agg_4byte arg) { return arg; } @@ -115,15 +115,15 @@ struct agg_4byte pass_agg_4byte(struct 
agg_4byte arg) { return arg; } struct agg_5byte { char a[5]; }; struct agg_5byte pass_agg_5byte(struct agg_5byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_5byte(ptr dead_on_unwind noalias writable sret(%struct.agg_5byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_5byte(ptr dead_on_unwind noalias writable sret(%struct.agg_5byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_6byte { char a[6]; }; struct agg_6byte pass_agg_6byte(struct agg_6byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_6byte(ptr dead_on_unwind noalias writable sret(%struct.agg_6byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_6byte(ptr dead_on_unwind noalias writable sret(%struct.agg_6byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_7byte { char a[7]; }; struct agg_7byte pass_agg_7byte(struct agg_7byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_7byte(ptr dead_on_unwind noalias writable sret(%struct.agg_7byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_7byte(ptr dead_on_unwind noalias writable sret(%struct.agg_7byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_8byte { char a[8]; }; struct agg_8byte pass_agg_8byte(struct agg_8byte arg) { return arg; } @@ -131,7 +131,7 @@ struct agg_8byte pass_agg_8byte(struct agg_8byte arg) { return arg; } struct agg_16byte { char a[16]; }; struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_16byte(ptr dead_on_unwind noalias writable sret(%struct.agg_16byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_16byte(ptr dead_on_unwind noalias writable sret(%struct.agg_16byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) // Float-like aggregate types @@ -153,7 +153,7 @@ struct agg_double pass_agg_double(struct agg_double arg) { return arg; } struct agg_longdouble { long double a; }; struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg__Float16_a4 { _Float16 a __attribute__((aligned (4))); }; struct agg__Float16_a4 pass_agg__Float16_a4(struct agg__Float16_a4 arg) { return arg; } @@ -167,7 +167,7 @@ struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return struct agg__Float16_a16 { _Float16 a __attribute__((aligned (16))); }; struct agg__Float16_a16 pass_agg__Float16_a16(struct agg__Float16_a16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg__Float16_a16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a16) align 16 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg__Float16_a16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a16) align 16 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_float_a8 { float a __attribute__((aligned (8))); }; struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; } @@ -176,7 +176,7 @@ struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; } struct agg_float_a16 { float a __attribute__((aligned (16))); }; struct agg_float_a16 pass_agg_float_a16(struct agg_float_a16 arg) { return arg; } -// CHECK-LABEL: 
define{{.*}} void @pass_agg_float_a16(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a16) align 16 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_float_a16(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a16) align 16 %{{.*}}, ptr dead_on_return %{{.*}}) // Verify that the following are *not* float-like aggregate types diff --git a/clang/test/CodeGen/SystemZ/systemz-inline-asm.c b/clang/test/CodeGen/SystemZ/systemz-inline-asm.c index 434937a66389c..d76fb4bd1fda6 100644 --- a/clang/test/CodeGen/SystemZ/systemz-inline-asm.c +++ b/clang/test/CodeGen/SystemZ/systemz-inline-asm.c @@ -131,7 +131,7 @@ double test_f64(double f, double g) { long double test_f128(long double f, long double g) { asm("axbr %0, %2" : "=f" (f) : "0" (f), "f" (g)); return f; -// CHECK: define{{.*}} void @test_f128(ptr dead_on_unwind noalias writable writeonly sret(fp128) align 8 captures(none) initializes((0, 16)) [[DEST:%.*]], ptr noundef readonly captures(none) %0, ptr noundef readonly captures(none) %1) +// CHECK: define{{.*}} void @test_f128(ptr dead_on_unwind noalias writable writeonly sret(fp128) align 8 captures(none) initializes((0, 16)) [[DEST:%.*]], ptr dead_on_return noundef readonly captures(none) %0, ptr dead_on_return noundef readonly captures(none) %1) // CHECK: %f = load fp128, ptr %0 // CHECK: %g = load fp128, ptr %1 // CHECK: [[RESULT:%.*]] = tail call fp128 asm "axbr $0, $2", "=f,0,f"(fp128 %f, fp128 %g) diff --git a/clang/test/CodeGen/X86/cx-complex-range.c b/clang/test/CodeGen/X86/cx-complex-range.c index f87091427df71..b16b10b7b8a21 100644 --- a/clang/test/CodeGen/X86/cx-complex-range.c +++ b/clang/test/CodeGen/X86/cx-complex-range.c @@ -1064,7 +1064,7 @@ _Complex _Float16 mulf16(_Complex _Float16 a, _Complex _Float16 b) { // PRMTD-NEXT: ret <2 x half> [[TMP33]] // // X86WINPRMTD-LABEL: define dso_local i32 @f1( -// X86WINPRMTD-SAME: i32 noundef [[A_COERCE:%.*]], ptr noundef [[B:%.*]], i32 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: i32 noundef [[A_COERCE:%.*]], ptr dead_on_return noundef [[B:%.*]], i32 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[RETVAL:%.*]] = alloca { half, half }, align 2 // X86WINPRMTD-NEXT: [[A:%.*]] = alloca { half, half }, align 2 diff --git a/clang/test/CodeGen/X86/i128-debuginfo.c b/clang/test/CodeGen/X86/i128-debuginfo.c new file mode 100644 index 0000000000000..4b865c1bed9f0 --- /dev/null +++ b/clang/test/CodeGen/X86/i128-debuginfo.c @@ -0,0 +1,10 @@ +// no autogeneration since update_cc_test_checks does not support -g +// RUN: %clang_cc1 -triple x86_64-pc-linux -O1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s + +// CHECK-LABEL: define{{.*}} i128 @add(i128 noundef %a) +// CHECK: #dbg_value(i128 %a, ![[DI:.*]], !DIExpression() +__int128_t add(__int128_t a) { + return a + a; +} + +// CHECK: ![[DI]] = !DILocalVariable(name: "a", arg: 1 diff --git a/clang/test/CodeGen/X86/prefetchi-error.c b/clang/test/CodeGen/X86/prefetchi-error.c new file mode 100644 index 0000000000000..31494f7cff152 --- /dev/null +++ b/clang/test/CodeGen/X86/prefetchi-error.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +prefetchi -fsyntax-only -verify + +#include + +void test_invalid_prefetchi(void* p) { + __builtin_ia32_prefetchi(p, 1); // expected-error {{argument value 1 is outside the valid range [2, 3]}} +} diff --git a/clang/test/CodeGen/X86/x86_32-arguments-win32.c b/clang/test/CodeGen/X86/x86_32-arguments-win32.c index 
5b81c43f4bbb8..53d040af725b0 100644 --- a/clang/test/CodeGen/X86/x86_32-arguments-win32.c +++ b/clang/test/CodeGen/X86/x86_32-arguments-win32.c @@ -72,10 +72,10 @@ void receive_vec_512(__m512 x, __m512 y, __m512 z, __m512 w, __m512 q) { void receive_vec_1024(__m1024 x, __m1024 y, __m1024 z, __m1024 w, __m1024 q) { gv1024 = x + y + z + w + q; } -// CHECK-LABEL: define dso_local void @receive_vec_128(<4 x float> inreg noundef %x, <4 x float> inreg noundef %y, <4 x float> inreg noundef %z, ptr noundef %0, ptr noundef %1) -// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg noundef %x, <8 x float> inreg noundef %y, <8 x float> inreg noundef %z, ptr noundef %0, ptr noundef %1) -// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg noundef %x, <16 x float> inreg noundef %y, <16 x float> inreg noundef %z, ptr noundef %0, ptr noundef %1) -// CHECK-LABEL: define dso_local void @receive_vec_1024(ptr noundef %0, ptr noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) +// CHECK-LABEL: define dso_local void @receive_vec_128(<4 x float> inreg noundef %x, <4 x float> inreg noundef %y, <4 x float> inreg noundef %z, ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) +// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg noundef %x, <8 x float> inreg noundef %y, <8 x float> inreg noundef %z, ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) +// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg noundef %x, <16 x float> inreg noundef %y, <16 x float> inreg noundef %z, ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) +// CHECK-LABEL: define dso_local void @receive_vec_1024(ptr dead_on_return noundef %0, ptr dead_on_return noundef %1, ptr dead_on_return noundef %2, ptr dead_on_return noundef %3, ptr dead_on_return noundef %4) void pass_vec_128(void) { __m128 z = {0}; @@ -83,13 +83,13 @@ void pass_vec_128(void) { } // CHECK-LABEL: define dso_local void @pass_vec_128() -// CHECK: call void @receive_vec_128(<4 x float> inreg noundef %{{[^,)]*}}, <4 x float> inreg noundef %{{[^,)]*}}, <4 x float> inreg noundef %{{[^,)]*}}, ptr noundef %{{[^,)]*}}, ptr noundef %{{[^,)]*}}) +// CHECK: call void @receive_vec_128(<4 x float> inreg noundef %{{[^,)]*}}, <4 x float> inreg noundef %{{[^,)]*}}, <4 x float> inreg noundef %{{[^,)]*}}, ptr dead_on_return noundef %{{[^,)]*}}, ptr dead_on_return noundef %{{[^,)]*}}) void __fastcall fastcall_indirect_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q) { gv128 = x + y + z + w + q; } -// CHECK-LABEL: define dso_local x86_fastcallcc void @"\01@fastcall_indirect_vec@84"(<4 x float> inreg noundef %x, <4 x float> inreg noundef %y, <4 x float> inreg noundef %z, ptr inreg noundef %0, i32 inreg noundef %edx, ptr noundef %1) +// CHECK-LABEL: define dso_local x86_fastcallcc void @"\01@fastcall_indirect_vec@84"(<4 x float> inreg noundef %x, <4 x float> inreg noundef %y, <4 x float> inreg noundef %z, ptr dead_on_return inreg noundef %0, i32 inreg noundef %edx, ptr dead_on_return noundef %1) struct __declspec(align(1)) Align1 { unsigned long long x; }; struct __declspec(align(4)) Align4 { unsigned long long x; }; @@ -156,4 +156,4 @@ void pass_fixed_align_variadic() { // correctly in Clang than it is to be bug for bug compatible, so we pass such // arguments indirectly. // CHECK-LABEL: define dso_local void @pass_fixed_align_variadic() -// CHECK: call void (ptr, ...) 
@receive_fixed_align_variadic(ptr noundef %{{[^)]*}}, i32 noundef 42) +// CHECK: call void (ptr, ...) @receive_fixed_align_variadic(ptr dead_on_return noundef %{{[^)]*}}, i32 noundef 42) diff --git a/clang/test/CodeGen/X86/x86_64-arguments-win32.c b/clang/test/CodeGen/X86/x86_64-arguments-win32.c index 8768e73a854aa..6010e531acb00 100644 --- a/clang/test/CodeGen/X86/x86_64-arguments-win32.c +++ b/clang/test/CodeGen/X86/x86_64-arguments-win32.c @@ -21,7 +21,7 @@ void f4(unsigned short a) {} // CHECK-LABEL: define dso_local void @f5(i64 noundef %a.coerce) void f5(_Complex float a) {} -// CHECK-LABEL: define dso_local void @f6(ptr noundef %a) +// CHECK-LABEL: define dso_local void @f6(ptr dead_on_return noundef %a) void f6(_Complex double a) {} // CHECK-LABEL: define dso_local i64 @f7() diff --git a/clang/test/CodeGen/X86/x86_64-arguments.c b/clang/test/CodeGen/X86/x86_64-arguments.c index 82845f0a2b31f..580f9487395d3 100644 --- a/clang/test/CodeGen/X86/x86_64-arguments.c +++ b/clang/test/CodeGen/X86/x86_64-arguments.c @@ -551,6 +551,45 @@ struct s68 { void f68(struct s68 x) { } +// CHECK-LABEL: define{{.*}} i128 @f69(i128 noundef %a) +__int128_t f69(__int128_t a) { + return a; +} + +// CHECK-LABEL: define{{.*}} i128 @f70(i128 noundef %a) +__uint128_t f70(__uint128_t a) { + return a; +} + +// check that registers are correctly counted for (u)int128_t arguments +struct s71 { + long long a, b; +}; +// CHECK-LABEL: define{{.*}} void @f71(i128 noundef %a, i128 noundef %b, i64 noundef %c, ptr noundef byval(%struct.s71) align 8 %d) +void f71(__int128_t a, __int128_t b, long long c, struct s71 d) { +} +// CHECK-LABEL: define{{.*}} void @f72(i128 noundef %a, i128 noundef %b, i64 %d.coerce0, i64 %d.coerce1) +void f72(__int128_t a, __int128_t b, struct s71 d) { +} + +// check that structs containing (u)int128_t are passed correctly +struct s73 { + struct inner { + __uint128_t a; + }; + struct inner in; +}; +// CHECK-LABEL: define{{.*}} i128 @f73(i128 %a.coerce) +struct s73 f73(struct s73 a) { + return a; +} + +// check that _BitInt(128) is still passed correctly on the stack +// CHECK-LABEL: define{{.*}} i128 @f74(i128 noundef %b, i128 noundef %c, i128 noundef %d, i64 noundef %e, ptr noundef byval(i128) align 8 %0) +_BitInt(128) f74(__uint128_t b, __uint128_t c, __uint128_t d, long e, _BitInt(128) a) { + return a; +} + /// The synthesized __va_list_tag does not have file/line fields. 
// CHECK: = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "__va_list_tag", // CHECK-NOT: file: diff --git a/clang/test/CodeGen/aapcs64-align.cpp b/clang/test/CodeGen/aapcs64-align.cpp index 1c26d68e434f4..53fc53c2f7296 100644 --- a/clang/test/CodeGen/aapcs64-align.cpp +++ b/clang/test/CodeGen/aapcs64-align.cpp @@ -122,8 +122,8 @@ unsigned sizeof_RidiculouslyOverSizedBitfield = sizeof(RidiculouslyOverSizedBitf unsigned alignof_RidiculouslyOverSizedBitfield = alignof(RidiculouslyOverSizedBitfield); // CHECK: define{{.*}} void @g9 -// CHECK: call void @f9(i32 noundef 1, ptr noundef nonnull %agg.tmp) -// CHECK: declare void @f9(i32 noundef, ptr noundef) +// CHECK: call void @f9(i32 noundef 1, ptr dead_on_return noundef nonnull %agg.tmp) +// CHECK: declare void @f9(i32 noundef, ptr dead_on_return noundef) void f9(int a, RidiculouslyOverSizedBitfield b); void g9() { RidiculouslyOverSizedBitfield s = {42}; diff --git a/clang/test/CodeGen/alloc-align-attr.c b/clang/test/CodeGen/alloc-align-attr.c index 76e5d1041b19f..c4c4e76eaaa04 100644 --- a/clang/test/CodeGen/alloc-align-attr.c +++ b/clang/test/CodeGen/alloc-align-attr.c @@ -70,66 +70,42 @@ __INT32_TYPE__ test4(__SIZE_TYPE__ a) { struct Empty {}; struct MultiArgs { __INT64_TYPE__ a, b;}; -// Struct parameter doesn't take up an IR parameter, 'i' takes up 2. +// Struct parameter doesn't take up an IR parameter, 'i' takes up 1. // Truncation to i64 is permissible, since alignments of greater than 2^64 are insane. __INT32_TYPE__ *m3(struct Empty s, __int128_t i) __attribute__((alloc_align(2))); // CHECK-LABEL: @test5( // CHECK-NEXT: entry: -// CHECK-NEXT: [[A:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[E:%.*]] = alloca [[STRUCT_EMPTY:%.*]], align 1 -// CHECK-NEXT: [[COERCE:%.*]] = alloca i128, align 16 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0 -// CHECK-NEXT: store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1 -// CHECK-NEXT: store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8 -// CHECK-NEXT: [[A1:%.*]] = load i128, ptr [[A]], align 16 -// CHECK-NEXT: store i128 [[A1]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: store i128 [[TMP2]], ptr [[COERCE]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0 -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call ptr @m3(i64 noundef [[TMP4]], i64 noundef [[TMP6]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64 +// CHECK-NEXT: store i128 [[A:%.*]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[CALL:%.*]] = call ptr @m3(i128 noundef [[TMP0]]) +// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64 // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ] -// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[CALL]], align 4 -// CHECK-NEXT: ret i32 [[TMP7]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CALL]], align 4 +// CHECK-NEXT: ret i32 [[TMP1]] // __INT32_TYPE__ test5(__int128_t a) { struct Empty e; return *m3(e, a); } -// Struct parameter 
takes up 2 parameters, 'i' takes up 2. +// Struct parameter takes up 2 parameters, 'i' takes up 1. __INT32_TYPE__ *m4(struct MultiArgs s, __int128_t i) __attribute__((alloc_align(2))); // CHECK-LABEL: @test6( // CHECK-NEXT: entry: -// CHECK-NEXT: [[A:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[E:%.*]] = alloca [[STRUCT_MULTIARGS:%.*]], align 8 -// CHECK-NEXT: [[COERCE:%.*]] = alloca i128, align 16 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0 -// CHECK-NEXT: store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1 -// CHECK-NEXT: store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8 -// CHECK-NEXT: [[A1:%.*]] = load i128, ptr [[A]], align 16 -// CHECK-NEXT: store i128 [[A1]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0 +// CHECK-NEXT: store i128 [[A:%.*]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1 // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8 -// CHECK-NEXT: store i128 [[TMP2]], ptr [[COERCE]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0 -// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1 -// CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call ptr @m4(i64 [[TMP4]], i64 [[TMP6]], i64 noundef [[TMP8]], i64 noundef [[TMP10]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64 +// CHECK-NEXT: [[CALL:%.*]] = call ptr @m4(i64 [[TMP2]], i64 [[TMP4]], i128 noundef [[TMP0]]) +// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64 // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ] -// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[CALL]], align 4 -// CHECK-NEXT: ret i32 [[TMP11]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CALL]], align 4 +// CHECK-NEXT: ret i32 [[TMP5]] // __INT32_TYPE__ test6(__int128_t a) { struct MultiArgs e; diff --git a/clang/test/CodeGen/arm-aapcs-vfp.c b/clang/test/CodeGen/arm-aapcs-vfp.c index 6581929f99f14..e60ed1e52c33a 100644 --- a/clang/test/CodeGen/arm-aapcs-vfp.c +++ b/clang/test/CodeGen/arm-aapcs-vfp.c @@ -65,7 +65,7 @@ struct big_struct { float f4; }; // CHECK: define{{.*}} arm_aapcs_vfpcc void @test_big([5 x i32] %{{.*}}) -// CHECK64: define{{.*}} void @test_big(ptr noundef %{{.*}}) +// CHECK64: define{{.*}} void @test_big(ptr dead_on_return noundef %{{.*}}) // CHECK64: call void @llvm.memcpy // CHECK64: call void @big_callee(ptr extern void big_callee(struct big_struct); diff --git a/clang/test/CodeGen/arm-abi-vector.c b/clang/test/CodeGen/arm-abi-vector.c index c2a8902007980..93b770878c3fa 100644 --- a/clang/test/CodeGen/arm-abi-vector.c +++ 
b/clang/test/CodeGen/arm-abi-vector.c @@ -177,11 +177,11 @@ double varargs_vec_19c(int fixed, ...) { double test_19c(__char19 *in) { // CHECK: test_19c -// CHECK: call arm_aapcscc double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// CHECK: call arm_aapcscc double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) // APCS-GNU: test_19c -// APCS-GNU: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// APCS-GNU: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) // ANDROID: test_19c -// ANDROID: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// ANDROID: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) return varargs_vec_19c(19, *in); } diff --git a/clang/test/CodeGen/arm-swiftcall.c b/clang/test/CodeGen/arm-swiftcall.c index 677b878c6765d..55c22a45fd8de 100644 --- a/clang/test/CodeGen/arm-swiftcall.c +++ b/clang/test/CodeGen/arm-swiftcall.c @@ -261,7 +261,7 @@ TEST(struct_big_1) // CHECK-LABEL: define{{.*}} void @return_struct_big_1({{.*}} dead_on_unwind noalias writable sret({{.*}}) // Should not be byval. -// CHECK-LABEL: define{{.*}} void @take_struct_big_1(ptr{{( %.*)?}}) +// CHECK-LABEL: define{{.*}} void @take_struct_big_1(ptr dead_on_return{{( %.*)?}}) /*****************************************************************************/ /********************************* TYPE MERGING ******************************/ diff --git a/clang/test/CodeGen/arm64-abi-vector.c b/clang/test/CodeGen/arm64-abi-vector.c index 81e42315c883b..cf50cdd2fe86e 100644 --- a/clang/test/CodeGen/arm64-abi-vector.c +++ b/clang/test/CodeGen/arm64-abi-vector.c @@ -128,7 +128,7 @@ double varargs_vec_19c(int fixed, ...) { double test_19c(__char19 *in) { // CHECK: test_19c -// CHECK: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// CHECK: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) return varargs_vec_19c(19, *in); } @@ -211,7 +211,7 @@ double varargs_vec_5i(int fixed, ...) { double test_5i(__int5 *in) { // CHECK: test_5i -// CHECK: call double (i32, ...) @varargs_vec_5i(i32 noundef 5, ptr noundef {{%.*}}) +// CHECK: call double (i32, ...) @varargs_vec_5i(i32 noundef 5, ptr dead_on_return noundef {{%.*}}) return varargs_vec_5i(5, *in); } @@ -231,7 +231,7 @@ double varargs_vec_3d(int fixed, ...) { double test_3d(__double3 *in) { // CHECK: test_3d -// CHECK: call double (i32, ...) @varargs_vec_3d(i32 noundef 3, ptr noundef {{%.*}}) +// CHECK: call double (i32, ...) @varargs_vec_3d(i32 noundef 3, ptr dead_on_return noundef {{%.*}}) return varargs_vec_3d(3, *in); } @@ -291,7 +291,7 @@ double test(__char3 *c3, __char5 *c5, __char9 *c9, __char19 *c19, __short3 *s3, __short5 *s5, __int3 *i3, __int5 *i5, __double3 *d3) { double ret = varargs_vec(3, *c3, *c5, *c9, *c19, *s3, *s5, *i3, *i5, *d3); -// CHECK: call double (i32, ...) @varargs_vec(i32 noundef 3, i32 {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, ptr noundef {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, <4 x i32> {{%.*}}, ptr noundef {{%.*}}, ptr noundef {{%.*}}) +// CHECK: call double (i32, ...) 
@varargs_vec(i32 noundef 3, i32 {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, ptr dead_on_return noundef {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, <4 x i32> {{%.*}}, ptr dead_on_return noundef {{%.*}}, ptr dead_on_return noundef {{%.*}}) return ret; } @@ -350,7 +350,7 @@ __attribute__((noinline)) double args_vec_19c(int fixed, __char19 c19) { double fixed_19c(__char19 *in) { // CHECK: fixed_19c -// CHECK: call double @args_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// CHECK: call double @args_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) return args_vec_19c(19, *in); } @@ -409,7 +409,7 @@ __attribute__((noinline)) double args_vec_5i(int fixed, __int5 c5) { double fixed_5i(__int5 *in) { // CHECK: fixed_5i -// CHECK: call double @args_vec_5i(i32 noundef 5, ptr noundef {{%.*}}) +// CHECK: call double @args_vec_5i(i32 noundef 5, ptr dead_on_return noundef {{%.*}}) return args_vec_5i(5, *in); } @@ -424,6 +424,6 @@ __attribute__((noinline)) double args_vec_3d(int fixed, __double3 c3) { double fixed_3d(__double3 *in) { // CHECK: fixed_3d -// CHECK: call double @args_vec_3d(i32 noundef 3, ptr noundef {{%.*}}) +// CHECK: call double @args_vec_3d(i32 noundef 3, ptr dead_on_return noundef {{%.*}}) return args_vec_3d(3, *in); } diff --git a/clang/test/CodeGen/arm64-arguments.c b/clang/test/CodeGen/arm64-arguments.c index 4c4f85d923e78..2e3ab388432f6 100644 --- a/clang/test/CodeGen/arm64-arguments.c +++ b/clang/test/CodeGen/arm64-arguments.c @@ -163,7 +163,7 @@ void f32(struct s32 s) { } // A composite type larger than 16 bytes should be passed indirectly. struct s33 { char buf[32*32]; }; void f33(struct s33 s) { } -// CHECK: define{{.*}} void @f33(ptr noundef %s) +// CHECK: define{{.*}} void @f33(ptr dead_on_return noundef %s) struct s34 { char c; }; void f34(struct s34 s); @@ -226,9 +226,9 @@ T_float32x2 f1_0(T_float32x2 a0) { return a0; } // CHECK: define{{.*}} <4 x float> @f1_1(<4 x float> noundef %{{.*}}) T_float32x4 f1_1(T_float32x4 a0) { return a0; } // Vector with length bigger than 16-byte is illegal and is passed indirectly. 
-// CHECK: define{{.*}} void @f1_2(ptr dead_on_unwind noalias writable sret(<8 x float>) align 16 %{{.*}}, ptr noundef %0) +// CHECK: define{{.*}} void @f1_2(ptr dead_on_unwind noalias writable sret(<8 x float>) align 16 %{{.*}}, ptr dead_on_return noundef %0) T_float32x8 f1_2(T_float32x8 a0) { return a0; } -// CHECK: define{{.*}} void @f1_3(ptr dead_on_unwind noalias writable sret(<16 x float>) align 16 %{{.*}}, ptr noundef %0) +// CHECK: define{{.*}} void @f1_3(ptr dead_on_unwind noalias writable sret(<16 x float>) align 16 %{{.*}}, ptr dead_on_return noundef %0) T_float32x16 f1_3(T_float32x16 a0) { return a0; } // Testing alignment with aggregates: HFA, aggregates with size <= 16 bytes and @@ -278,7 +278,7 @@ struct s37 typedef struct s37 s37_with_align; int32x4_t f37(int i, s37_with_align s1, s37_with_align s2) { -// CHECK: define{{.*}} <4 x i32> @f37(i32 noundef %i, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} <4 x i32> @f37(i32 noundef %i, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: load <4 x i32>, ptr %s1, align 16 // CHECK: load <4 x i32>, ptr %s2, align 16 int32x4_t v = vaddq_s32(*(int32x4_t *)&s1, @@ -292,7 +292,7 @@ int32x4_t caller37() { // CHECK: %[[b:.*]] = alloca %struct.s37, align 16 // CHECK: call void @llvm.memcpy // CHECK: call void @llvm.memcpy -// CHECK: call <4 x i32> @f37(i32 noundef 3, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call <4 x i32> @f37(i32 noundef 3, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f37(3, g37, g37); } @@ -530,7 +530,7 @@ typedef struct s42 s42_no_align; // passing structs in registers __attribute__ ((noinline)) int f42(int i, s42_no_align s1, s42_no_align s2) { -// CHECK: define{{.*}} i32 @f42(i32 noundef %i, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} i32 @f42(i32 noundef %i, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s1, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s2, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s1, i32 0, i32 1 @@ -545,14 +545,14 @@ int caller42() { // CHECK: %[[b:.*]] = alloca %struct.s42, align 4 // CHECK: call void @llvm.memcpy.p0.p0.i64 // CHECK: call void @llvm.memcpy.p0.p0.i64 -// CHECK: call i32 @f42(i32 noundef 3, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call i32 @f42(i32 noundef 3, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f42(3, g42, g42_2); } // passing structs on stack __attribute__ ((noinline)) int f42_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, s42_no_align s1, s42_no_align s2) { -// CHECK: define{{.*}} i32 @f42_stack(i32 noundef %i, i32 noundef %i2, i32 noundef %i3, i32 noundef %i4, i32 noundef %i5, i32 noundef %i6, i32 noundef %i7, i32 noundef %i8, i32 noundef %i9, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} i32 @f42_stack(i32 noundef %i, i32 noundef %i2, i32 noundef %i3, i32 noundef %i4, i32 noundef %i5, i32 noundef %i6, i32 noundef %i7, i32 noundef %i8, i32 noundef %i9, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s1, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s2, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s1, i32 0, i32 1 @@ -565,7 +565,7 @@ int caller42_stack() { // CHECK: %[[b:.*]] = alloca %struct.s42, align 4 // CHECK: call void @llvm.memcpy.p0.p0.i64 // CHECK: call void 
@llvm.memcpy.p0.p0.i64 -// CHECK: call i32 @f42_stack(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, i32 noundef 6, i32 noundef 7, i32 noundef 8, i32 noundef 9, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call i32 @f42_stack(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, i32 noundef 6, i32 noundef 7, i32 noundef 8, i32 noundef 9, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f42_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g42, g42_2); } @@ -583,7 +583,7 @@ typedef struct s43 s43_with_align; // passing aligned structs in registers __attribute__ ((noinline)) int f43(int i, s43_with_align s1, s43_with_align s2) { -// CHECK: define{{.*}} i32 @f43(i32 noundef %i, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} i32 @f43(i32 noundef %i, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s1, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s2, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s1, i32 0, i32 1 @@ -598,14 +598,14 @@ int caller43() { // CHECK: %[[b:.*]] = alloca %struct.s43, align 16 // CHECK: call void @llvm.memcpy.p0.p0.i64 // CHECK: call void @llvm.memcpy.p0.p0.i64 -// CHECK: call i32 @f43(i32 noundef 3, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call i32 @f43(i32 noundef 3, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f43(3, g43, g43_2); } // passing aligned structs on stack __attribute__ ((noinline)) int f43_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, s43_with_align s1, s43_with_align s2) { -// CHECK: define{{.*}} i32 @f43_stack(i32 noundef %i, i32 noundef %i2, i32 noundef %i3, i32 noundef %i4, i32 noundef %i5, i32 noundef %i6, i32 noundef %i7, i32 noundef %i8, i32 noundef %i9, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} i32 @f43_stack(i32 noundef %i, i32 noundef %i2, i32 noundef %i3, i32 noundef %i4, i32 noundef %i5, i32 noundef %i6, i32 noundef %i7, i32 noundef %i8, i32 noundef %i9, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s1, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s2, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s1, i32 0, i32 1 @@ -618,7 +618,7 @@ int caller43_stack() { // CHECK: %[[b:.*]] = alloca %struct.s43, align 16 // CHECK: call void @llvm.memcpy.p0.p0.i64 // CHECK: call void @llvm.memcpy.p0.p0.i64 -// CHECK: call i32 @f43_stack(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, i32 noundef 6, i32 noundef 7, i32 noundef 8, i32 noundef 9, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call i32 @f43_stack(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, i32 noundef 6, i32 noundef 7, i32 noundef 8, i32 noundef 9, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f43_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g43, g43_2); } diff --git a/clang/test/CodeGen/arm64-microsoft-arguments.cpp b/clang/test/CodeGen/arm64-microsoft-arguments.cpp index 85472645acb3b..a0a81be54325f 100644 --- a/clang/test/CodeGen/arm64-microsoft-arguments.cpp +++ b/clang/test/CodeGen/arm64-microsoft-arguments.cpp @@ -29,7 +29,7 @@ S2 f2() { // Pass and return for type size > 16 bytes. 
// CHECK: define {{.*}} void @{{.*}}f3{{.*}}(ptr dead_on_unwind noalias writable sret(%struct.S3) align 4 %agg.result) -// CHECK: call void {{.*}}func3{{.*}}(ptr dead_on_unwind writable sret(%struct.S3) align 4 %agg.result, ptr noundef %agg.tmp) +// CHECK: call void {{.*}}func3{{.*}}(ptr dead_on_unwind writable sret(%struct.S3) align 4 %agg.result, ptr dead_on_return noundef %agg.tmp) struct S3 { int a[5]; }; diff --git a/clang/test/CodeGen/armv7k-abi.c b/clang/test/CodeGen/armv7k-abi.c index 872e6423a4a99..6a781bc04d042 100644 --- a/clang/test/CodeGen/armv7k-abi.c +++ b/clang/test/CodeGen/armv7k-abi.c @@ -39,7 +39,7 @@ typedef struct { double z; } BigStruct; -// CHECK: define{{.*}} void @big_struct_indirect(ptr noundef %b) +// CHECK: define{{.*}} void @big_struct_indirect(ptr dead_on_return noundef %b) void big_struct_indirect(BigStruct b) {} // CHECK: define{{.*}} void @return_big_struct_indirect(ptr dead_on_unwind noalias writable sret diff --git a/clang/test/CodeGen/atomic-arm64.c b/clang/test/CodeGen/atomic-arm64.c index d2a30a3b6e66f..d539cad0c6a7d 100644 --- a/clang/test/CodeGen/atomic-arm64.c +++ b/clang/test/CodeGen/atomic-arm64.c @@ -57,7 +57,7 @@ void test3(pointer_pair_t pair) { } // CHECK-LABEL:define{{.*}} void @test4( -// CHECK-SAME: ptr noundef [[QUAD:%.*]]) +// CHECK-SAME: ptr dead_on_return noundef [[QUAD:%.*]]) // CHECK: [[QUAD_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[TEMP:%.*]] = alloca [[QUAD_T:%.*]], align 8 // CHECK-NEXT: store ptr [[QUAD]], ptr [[QUAD_INDIRECT_ADDR]] diff --git a/clang/test/CodeGen/attr-noundef.cpp b/clang/test/CodeGen/attr-noundef.cpp index abdf9496bd396..619dbec7678db 100644 --- a/clang/test/CodeGen/attr-noundef.cpp +++ b/clang/test/CodeGen/attr-noundef.cpp @@ -27,7 +27,7 @@ struct NoCopy { NoCopy ret_nocopy() { return {}; } void pass_nocopy(NoCopy e) {} // CHECK: [[DEF]] void @{{.*}}ret_nocopy{{.*}}(ptr dead_on_unwind noalias writable sret({{[^)]+}}) align 4 % -// CHECK: [[DEF]] void @{{.*}}pass_nocopy{{.*}}(ptr noundef % +// CHECK: [[DEF]] void @{{.*}}pass_nocopy{{.*}}(ptr dead_on_return noundef % struct Huge { int a[1024]; @@ -35,7 +35,8 @@ struct Huge { Huge ret_huge() { return {}; } void pass_huge(Huge h) {} // CHECK: [[DEF]] void @{{.*}}ret_huge{{.*}}(ptr dead_on_unwind noalias writable sret({{[^)]+}}) align 4 % -// CHECK: [[DEF]] void @{{.*}}pass_huge{{.*}}(ptr noundef +// CHECK-INTEL: [[DEF]] void @{{.*}}pass_huge{{.*}}(ptr noundef +// CHECK-AARCH: [[DEF]] void @{{.*}}pass_huge{{.*}}(ptr dead_on_return noundef } // namespace check_structs //************ Passing unions by value @@ -59,7 +60,7 @@ union NoCopy { NoCopy ret_nocopy() { return {}; } void pass_nocopy(NoCopy e) {} // CHECK: [[DEF]] void @{{.*}}ret_nocopy{{.*}}(ptr dead_on_unwind noalias writable sret({{[^)]+}}) align 4 % -// CHECK: [[DEF]] void @{{.*}}pass_nocopy{{.*}}(ptr noundef % +// CHECK: [[DEF]] void @{{.*}}pass_nocopy{{.*}}(ptr dead_on_return noundef % } // namespace check_unions //************ Passing `this` pointers diff --git a/clang/test/CodeGen/builtin-maximumnum-minimumnum.c b/clang/test/CodeGen/builtin-maximumnum-minimumnum.c new file mode 100644 index 0000000000000..ea9d2e7a4ed38 --- /dev/null +++ b/clang/test/CodeGen/builtin-maximumnum-minimumnum.c @@ -0,0 +1,171 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -x c++ -std=c++20 -disable-llvm-passes -O3 -triple x86_64 %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK + +typedef _Float16 half8 
__attribute__((ext_vector_type(8))); +typedef __bf16 bf16x8 __attribute__((ext_vector_type(8))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef double double2 __attribute__((ext_vector_type(2))); +typedef long double ldouble2 __attribute__((ext_vector_type(2))); + +// CHECK-LABEL: define dso_local noundef <8 x half> @_Z7pfmin16Dv8_DF16_S_( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) +// CHECK-NEXT: ret <8 x half> [[ELT_MINIMUMNUM]] +// +half8 pfmin16(half8 a, half8 b) { + return __builtin_elementwise_minimumnum(a, b); +} +// CHECK-LABEL: define dso_local noundef <8 x bfloat> @_Z8pfmin16bDv8_DF16bS_( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]], <8 x bfloat> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16 +// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]]) +// CHECK-NEXT: ret <8 x bfloat> [[ELT_MINIMUMNUM]] +// +bf16x8 pfmin16b(bf16x8 a, bf16x8 b) { + return __builtin_elementwise_minimumnum(a, b); +} +// CHECK-LABEL: define dso_local noundef <4 x float> @_Z7pfmin32Dv4_fS_( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[ELT_MINIMUMNUM]] +// +float4 pfmin32(float4 a, float4 b) { + return __builtin_elementwise_minimumnum(a, b); +} +// CHECK-LABEL: define dso_local noundef <2 x double> @_Z7pfmin64Dv2_dS_( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x 
double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: ret <2 x double> [[ELT_MINIMUMNUM]] +// +double2 pfmin64(double2 a, double2 b) { + return __builtin_elementwise_minimumnum(a, b); +} +// CHECK-LABEL: define dso_local noundef <2 x x86_fp80> @_Z7pfmin80Dv2_eS_( +// CHECK-SAME: ptr noundef byval(<2 x x86_fp80>) align 32 [[TMP0:%.*]], ptr noundef byval(<2 x x86_fp80>) align 32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 +// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <2 x x86_fp80> @llvm.minimumnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]]) +// CHECK-NEXT: ret <2 x x86_fp80> [[ELT_MINIMUMNUM]] +// +ldouble2 pfmin80(ldouble2 a, ldouble2 b) { + return __builtin_elementwise_minimumnum(a, b); +} + +// CHECK-LABEL: define dso_local noundef <8 x half> @_Z7pfmax16Dv8_DF16_S_( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16 +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MAXIMUMNUM:%.*]] = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) +// CHECK-NEXT: ret <8 x half> [[ELT_MAXIMUMNUM]] +// +half8 pfmax16(half8 a, half8 b) { + return __builtin_elementwise_maximumnum(a, b); +} +// CHECK-LABEL: define dso_local noundef <8 x bfloat> @_Z8pfmax16bDv8_DF16bS_( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]], <8 x bfloat> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16 +// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MAXIMUMNUM:%.*]] = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]]) +// CHECK-NEXT: ret <8 x bfloat> [[ELT_MAXIMUMNUM]] +// +bf16x8 pfmax16b(bf16x8 a, 
bf16x8 b) { + return __builtin_elementwise_maximumnum(a, b); +} +// CHECK-LABEL: define dso_local noundef <4 x float> @_Z7pfmax32Dv4_fS_( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MAXIMUMNUM:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[ELT_MAXIMUMNUM]] +// +float4 pfmax32(float4 a, float4 b) { + return __builtin_elementwise_maximumnum(a, b); +} +// CHECK-LABEL: define dso_local noundef <2 x double> @_Z7pfmax64Dv2_dS_( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MAXIMUMNUM:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: ret <2 x double> [[ELT_MAXIMUMNUM]] +// +double2 pfmax64(double2 a, double2 b) { + return __builtin_elementwise_maximumnum(a, b); +} + +// CHECK-LABEL: define dso_local noundef <2 x x86_fp80> @_Z7pfmax80Dv2_eS_( +// CHECK-SAME: ptr noundef byval(<2 x x86_fp80>) align 32 [[TMP0:%.*]], ptr noundef byval(<2 x x86_fp80>) align 32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 +// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <2 x x86_fp80> @llvm.minimumnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]]) +// CHECK-NEXT: ret <2 x x86_fp80> [[ELT_MINIMUMNUM]] +// +ldouble2 pfmax80(ldouble2 a, ldouble2 b) { + return __builtin_elementwise_minimumnum(a, b); +} + +//. +// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"Simple C++ TBAA"} +//. 
diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c index eda6c67fdad00..aa9965b815983 100644 --- a/clang/test/CodeGen/builtins.c +++ b/clang/test/CodeGen/builtins.c @@ -956,36 +956,24 @@ void test_builtin_os_log_errno(void) { void test_builtin_os_log_long_double(void *buf, long double ld) { // CHECK: %[[BUF_ADDR:.*]] = alloca ptr, align 8 // CHECK: %[[LD_ADDR:.*]] = alloca x86_fp80, align 16 - // CHECK: %[[COERCE:.*]] = alloca i128, align 16 // CHECK: store ptr %[[BUF]], ptr %[[BUF_ADDR]], align 8 // CHECK: store x86_fp80 %[[LD]], ptr %[[LD_ADDR]], align 16 // CHECK: %[[V0:.*]] = load ptr, ptr %[[BUF_ADDR]], align 8 // CHECK: %[[V1:.*]] = load x86_fp80, ptr %[[LD_ADDR]], align 16 // CHECK: %[[V2:.*]] = bitcast x86_fp80 %[[V1]] to i80 // CHECK: %[[V3:.*]] = zext i80 %[[V2]] to i128 - // CHECK: store i128 %[[V3]], ptr %[[COERCE]], align 16 - // CHECK: %[[V5:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 0 - // CHECK: %[[V6:.*]] = load i64, ptr %[[V5]], align 16 - // CHECK: %[[V7:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 1 - // CHECK: %[[V8:.*]] = load i64, ptr %[[V7]], align 8 - // CHECK: call void @__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i64 noundef %[[V6]], i64 noundef %[[V8]]) + // CHECK: call void @__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i128 noundef %[[V3]]) __builtin_os_log_format(buf, "%Lf", ld); } // CHECK-LABEL: define linkonce_odr hidden void @__os_log_helper_1_0_1_16_0 -// CHECK: (ptr noundef %[[BUFFER:.*]], i64 noundef %[[ARG0_COERCE0:.*]], i64 noundef %[[ARG0_COERCE1:.*]]) +// CHECK: (ptr noundef %[[BUFFER:.*]], i128 noundef %[[ARG0:.*]]) -// CHECK: %[[ARG0:.*]] = alloca i128, align 16 // CHECK: %[[BUFFER_ADDR:.*]] = alloca ptr, align 8 // CHECK: %[[ARG0_ADDR:.*]] = alloca i128, align 16 -// CHECK: %[[V1:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 0 -// CHECK: store i64 %[[ARG0_COERCE0]], ptr %[[V1]], align 16 -// CHECK: %[[V2:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 1 -// CHECK: store i64 %[[ARG0_COERCE1]], ptr %[[V2]], align 8 -// CHECK: %[[ARG01:.*]] = load i128, ptr %[[ARG0]], align 16 // CHECK: store ptr %[[BUFFER]], ptr %[[BUFFER_ADDR]], align 8 -// CHECK: store i128 %[[ARG01]], ptr %[[ARG0_ADDR]], align 16 +// CHECK: store i128 %[[ARG0]], ptr %[[ARG0_ADDR]], align 16 // CHECK: %[[BUF:.*]] = load ptr, ptr %[[BUFFER_ADDR]], align 8 // CHECK: %[[SUMMARY:.*]] = getelementptr i8, ptr %[[BUF]], i64 0 // CHECK: store i8 0, ptr %[[SUMMARY]], align 1 diff --git a/clang/test/CodeGen/cx-complex-range.c b/clang/test/CodeGen/cx-complex-range.c index cf74ab2dcca3f..fba692cac4492 100644 --- a/clang/test/CodeGen/cx-complex-range.c +++ b/clang/test/CodeGen/cx-complex-range.c @@ -1520,7 +1520,7 @@ void mulassignf(_Complex float *a, _Complex float b) { // PRMTD-NEXT: ret { double, double } [[DOTFCA_1_INSERT]] // // X86WINPRMTD-LABEL: define dso_local void @divd( -// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], 
align 8 @@ -1744,7 +1744,7 @@ void mulassignf(_Complex float *a, _Complex float b) { // PRMTD_FAST-NEXT: ret { double, double } [[DOTFCA_1_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local void @divd( -// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -1938,7 +1938,7 @@ _Complex double divd(_Complex double a, _Complex double b) { // PRMTD-NEXT: ret void // // X86WINPRMTD-LABEL: define dso_local void @divassignd( -// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -2180,7 +2180,7 @@ _Complex double divd(_Complex double a, _Complex double b) { // PRMTD_FAST-NEXT: ret void // // X86WINPRMTD_STRICT-LABEL: define dso_local void @divassignd( -// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -2325,7 +2325,7 @@ void divassignd(_Complex double *a, _Complex double b) { // PRMTD-NEXT: ret { double, double } [[DOTFCA_1_INSERT]] // // X86WINPRMTD-LABEL: define dso_local void @muld( -// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -2457,7 +2457,7 @@ void divassignd(_Complex double *a, _Complex double b) { // PRMTD_FAST-NEXT: ret { double, double } [[DOTFCA_1_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local void @muld( -// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load 
double, ptr [[A_REALP]], align 8 @@ -2594,7 +2594,7 @@ _Complex double muld(_Complex double a, _Complex double b) { // PRMTD-NEXT: ret void // // X86WINPRMTD-LABEL: define dso_local void @mulassignd( -// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -2744,7 +2744,7 @@ _Complex double muld(_Complex double a, _Complex double b) { // PRMTD_FAST-NEXT: ret void // // X86WINPRMTD_STRICT-LABEL: define dso_local void @mulassignd( -// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -2922,7 +2922,7 @@ void mulassignd(_Complex double *a, _Complex double b) { // PRMTD-NEXT: ret { x86_fp80, x86_fp80 } [[DOTFCA_1_INSERT]] // // X86WINPRMTD-LABEL: define dso_local void @divld( -// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -3190,7 +3190,7 @@ void mulassignd(_Complex double *a, _Complex double b) { // PRMTD_FAST-NEXT: ret { x86_fp80, x86_fp80 } [[DOTFCA_1_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local void @divld( -// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -3432,7 +3432,7 @@ _Complex long double divld(_Complex long double a, _Complex long double b) { // PRMTD-NEXT: ret void // // X86WINPRMTD-LABEL: define dso_local void @divassignld( -// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -3702,7 +3702,7 @@ _Complex long double divld(_Complex long double a, _Complex long double b) { // PRMTD_FAST-NEXT: ret void // // X86WINPRMTD_STRICT-LABEL: define dso_local 
void @divassignld( -// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -3895,7 +3895,7 @@ void divassignld(_Complex long double *a, _Complex long double b) { // PRMTD-NEXT: ret { x86_fp80, x86_fp80 } [[DOTFCA_1_INSERT]] // // X86WINPRMTD-LABEL: define dso_local void @mulld( -// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -4059,7 +4059,7 @@ void divassignld(_Complex long double *a, _Complex long double b) { // PRMTD_FAST-NEXT: ret { x86_fp80, x86_fp80 } [[DOTFCA_1_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local void @mulld( -// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -4220,7 +4220,7 @@ _Complex long double mulld(_Complex long double a, _Complex long double b) { // PRMTD-NEXT: ret void // // X86WINPRMTD-LABEL: define dso_local void @mulassignld( -// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -4386,7 +4386,7 @@ _Complex long double mulld(_Complex long double a, _Complex long double b) { // PRMTD_FAST-NEXT: ret void // // X86WINPRMTD_STRICT-LABEL: define dso_local void @mulassignld( -// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -4644,7 +4644,7 @@ void mulassignld(_Complex long double *a, _Complex long double b) { // PRMTD-NEXT: ret <2 x float> [[RETVAL_SROA_0_4_VEC_INSERT]] // // X86WINPRMTD-LABEL: define dso_local i64 @f1( -// X86WINPRMTD-SAME: i64 noundef [[A_COERCE:%.*]], ptr noundef [[B:%.*]], i64 
noundef [[C_COERCE:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: i64 noundef [[A_COERCE:%.*]], ptr dead_on_return noundef [[B:%.*]], i64 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE]] to i32 // X86WINPRMTD-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_SROA_0_0_EXTRACT_TRUNC]] to float @@ -5052,7 +5052,7 @@ void mulassignld(_Complex long double *a, _Complex long double b) { // PRMTD_FAST-NEXT: ret <2 x float> [[RETVAL_SROA_0_4_VEC_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local i64 @f1( -// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], ptr noundef [[B:%.*]], i64 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], ptr dead_on_return noundef [[B:%.*]], i64 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE]] to i32 // X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_SROA_0_0_EXTRACT_TRUNC]] to float diff --git a/clang/test/CodeGen/ext-int-cc.c b/clang/test/CodeGen/ext-int-cc.c index f31a4eb240c25..7cfd992fd48b4 100644 --- a/clang/test/CodeGen/ext-int-cc.c +++ b/clang/test/CodeGen/ext-int-cc.c @@ -32,10 +32,10 @@ // Make sure 128 and 64 bit versions are passed like integers. void ParamPassing(_BitInt(128) b, _BitInt(64) c) {} -// LIN64: define{{.*}} void @ParamPassing(i64 %{{.+}}, i64 %{{.+}}, i64 %{{.+}}) -// WIN64: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) -// LIN32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) -// WIN32: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) +// LIN64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) +// WIN64: define dso_local void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) +// LIN32: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) +// WIN32: define dso_local void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) // NVPTX64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // NVPTX: define{{.*}} void @ParamPassing(ptr byval(i128) align 8 %{{.+}}, i64 %{{.+}}) // SPARCV9: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) @@ -50,22 +50,22 @@ void ParamPassing(_BitInt(128) b, _BitInt(64) c) {} // ARC: define{{.*}} void @ParamPassing(ptr byval(i128) align 4 %{{.+}}, i64 inreg %{{.+}}) // XCORE: define{{.*}} void @ParamPassing(ptr byval(i128) align 4 %{{.+}}, i64 %{{.+}}) // RISCV64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) -// RISCV32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) +// RISCV32: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) // WASM: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) -// SYSTEMZ: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) +// SYSTEMZ: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) // PPC64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // PPC32: define{{.*}} void @ParamPassing(ptr byval(i128) align 8 %{{.+}}, i64 %{{.+}}) // AARCH64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // AARCH64DARWIN: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // ARM: define{{.*}} arm_aapcscc void @ParamPassing(ptr byval(i128) align 8 %{{.+}}, i64 %{{.+}}) // LA64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) -// LA32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) +// LA32: define{{.*}} void @ParamPassing(ptr 
dead_on_return %{{.+}}, i64 %{{.+}}) void ParamPassing2(_BitInt(127) b, _BitInt(63) c) {} // LIN64: define{{.*}} void @ParamPassing2(i64 %{{.+}}, i64 %{{.+}}, i64 %{{.+}}) -// WIN64: define dso_local void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) -// LIN32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) -// WIN32: define dso_local void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) +// WIN64: define dso_local void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) +// LIN32: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) +// WIN32: define dso_local void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) // NVPTX64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // NVPTX: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // SPARCV9: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) @@ -80,16 +80,16 @@ void ParamPassing2(_BitInt(127) b, _BitInt(63) c) {} // ARC: define{{.*}} void @ParamPassing2(ptr byval(i128) align 4 %{{.+}}, i63 inreg %{{.+}}) // XCORE: define{{.*}} void @ParamPassing2(ptr byval(i128) align 4 %{{.+}}, i63 %{{.+}}) // RISCV64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) -// RISCV32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) +// RISCV32: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) // WASM: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) -// SYSTEMZ: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 signext %{{.+}}) +// SYSTEMZ: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 signext %{{.+}}) // PPC64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) // PPC32: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // AARCH64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // AARCH64DARWIN: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // ARM: define{{.*}} arm_aapcscc void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // LA64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) -// LA32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) +// LA32: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) // Make sure we follow the signext rules for promotable integer types. void ParamPassing3(_BitInt(15) a, _BitInt(31) b) {} @@ -129,10 +129,10 @@ void ParamPassing3(_BitInt(15) a, _BitInt(31) b) {} // _BitInt widths to alert us to enable the test. 
void ParamPassing4(_BitInt(129) a) {} // LIN64: define{{.*}} void @ParamPassing4(ptr byval([24 x i8]) align 8 %{{.+}}) -// WIN64: define dso_local void @ParamPassing4(ptr %{{.+}}) -// LIN32: define{{.*}} void @ParamPassing4(ptr %{{.+}}) -// WIN32: define dso_local void @ParamPassing4(ptr %{{.+}}) -// AARCH64: define{{.*}} void @ParamPassing4(ptr %{{.+}}) +// WIN64: define dso_local void @ParamPassing4(ptr dead_on_return %{{.+}}) +// LIN32: define{{.*}} void @ParamPassing4(ptr dead_on_return %{{.+}}) +// WIN32: define dso_local void @ParamPassing4(ptr dead_on_return %{{.+}}) +// AARCH64: define{{.*}} void @ParamPassing4(ptr dead_on_return %{{.+}}) // NVPTX64-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // NVPTX-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // SPARCV9-NOT: define{{.*}} void @ParamPassing4(ptr %{{.+}}) @@ -154,8 +154,8 @@ void ParamPassing4(_BitInt(129) a) {} // PPC32-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // AARCH64DARWIN-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // ARM-NOT: define{{.*}} arm_aapcscc void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) -// LA64: define{{.*}} void @ParamPassing4(ptr %{{.+}}) -// LA32: define{{.*}} void @ParamPassing4(ptr %{{.+}}) +// LA64: define{{.*}} void @ParamPassing4(ptr dead_on_return %{{.+}}) +// LA32: define{{.*}} void @ParamPassing4(ptr dead_on_return %{{.+}}) #endif _BitInt(63) ReturnPassing(void) { return 0; } @@ -251,7 +251,7 @@ _BitInt(127) ReturnPassing3(void) { return 0; } // LA32: define{{.*}} void @ReturnPassing3(ptr dead_on_unwind noalias writable sret _BitInt(128) ReturnPassing4(void) { return 0; } -// LIN64: define{{.*}} { i64, i64 } @ReturnPassing4( +// LIN64: define{{.*}} i128 @ReturnPassing4( // WIN64: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret // LIN32: define{{.*}} void @ReturnPassing4(ptr dead_on_unwind noalias writable sret // WIN32: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret diff --git a/clang/test/CodeGen/extend-arg-64.c b/clang/test/CodeGen/extend-arg-64.c index 2cb56d35af21d..8b99c01807ecc 100644 --- a/clang/test/CodeGen/extend-arg-64.c +++ b/clang/test/CodeGen/extend-arg-64.c @@ -84,7 +84,7 @@ int test(void) { #ifdef D128 knr(i128); // CHECKEXT: load i128 - // CHECKEXT: call{{.*}} void (i64, i64, ...) @knr + // CHECKEXT: call{{.*}} void (i128, ...) 
@knr #endif knr(u32, s32, u16, s16, u8, s8); diff --git a/clang/test/CodeGen/isfpclass.c b/clang/test/CodeGen/isfpclass.c index 26dd846a2bf20..ee3a22b40fefd 100644 --- a/clang/test/CodeGen/isfpclass.c +++ b/clang/test/CodeGen/isfpclass.c @@ -160,7 +160,7 @@ int4 check_isfpclass_nan_strict_v4f32(float4 x) { } // CHECK-LABEL: define dso_local void @check_isfpclass_nan_v4f64 -// CHECK-SAME: (ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-SAME: (ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[X:%.*]] = load <4 x double>, ptr [[TMP0]], align 16, !tbaa [[TBAA2:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = fcmp uno <4 x double> [[X]], zeroinitializer diff --git a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c index eb706154300a2..0124cc5c06d43 100644 --- a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c +++ b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c @@ -30,37 +30,37 @@ long double powl(long double a, long double b); // // CHECK-PPC-LABEL: define dso_local ppc_fp128 @test_powl( // CHECK-PPC-SAME: ppc_fp128 noundef [[A:%.*]], ppc_fp128 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @powl(ppc_fp128 noundef [[A]], ppc_fp128 noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @powl(ppc_fp128 noundef [[A]], ppc_fp128 noundef [[B]]) #[[ATTR4:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] // // CHECK-ARM-LABEL: define dso_local double @test_powl( // CHECK-ARM-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-ARM: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR2:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-ARM: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] // // CHECK-ARM-HF-LABEL: define dso_local double @test_powl( // CHECK-ARM-HF-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR2:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] // // CHECK-THUMB-LABEL: define double @test_powl( // CHECK-THUMB-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-THUMB: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR2:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-THUMB: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] // // CHECK-AARCH-LABEL: define dso_local fp128 @test_powl( // CHECK-AARCH-SAME: fp128 noundef [[A:%.*]], fp128 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @powl(fp128 noundef [[A]], fp128 noundef [[B]]) #[[ATTR2:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// 
CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @powl(fp128 noundef [[A]], fp128 noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] // // CHECK-SPIR-LABEL: define dso_local spir_func double @test_powl( // CHECK-SPIR-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR4:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] // // CHECK-MINGW32-LABEL: define dso_local void @test_powl( -// CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret(x86_fp80) align 16 captures(none) initializes((0, 10)) [[AGG_RESULT:%.*]], ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA3:![0-9]+]] -// CHECK-MINGW32: [[B:%.*]] = load x86_fp80, ptr [[TMP1]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: store x86_fp80 [[B]], ptr [[BYVAL_TEMP1:%.*]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: call void @powl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr noundef nonnull [[BYVAL_TEMP]], ptr noundef nonnull [[BYVAL_TEMP1]]) #[[ATTR3:[0-9]+]] -// CHECK-MINGW32: [[TMP2:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: store x86_fp80 [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[TBAA3]] +// CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret(x86_fp80) align 16 captures(none) initializes((0, 10)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA6:![0-9]+]] +// CHECK-MINGW32: [[B:%.*]] = load x86_fp80, ptr [[TMP1]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[B]], ptr [[BYVAL_TEMP1:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: call void @powl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP1]]) #[[ATTR3:[0-9]+]] +// CHECK-MINGW32: [[TMP2:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[TBAA6]] // long double test_powl(long double a, long double b) { return powl(a, b); @@ -93,51 +93,51 @@ long double test_powl(long double a, long double b) { // CHECK-I686: store x86_fp80 [[MUL_IR:%.*]], ptr [[AGG_RESULT_IMAGP:%.*]], align 4 // // CHECK-PPC-LABEL: define dso_local void @test_cargl( -// CHECK-PPC-SAME: ptr dead_on_unwind noalias writable writeonly sret({ ppc_fp128, ppc_fp128 }) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef readonly byval({ ppc_fp128, ppc_fp128 }) align 16 captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-PPC-SAME: ptr dead_on_unwind noalias writable writeonly sret({ ppc_fp128, ppc_fp128 }) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef 
readonly byval({ ppc_fp128, ppc_fp128 }) align 16 captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-PPC: [[CLD_REAL:%.*]] = load ppc_fp128, ptr [[CLD]], align 16 // CHECK-PPC: [[CLD_IMAG:%.*]] = load ppc_fp128, ptr [[CLD_IMAGP:%.*]], align 16 // CHECK-PPC: store ppc_fp128 [[CLD_REAL]], ptr [[BYVAL_TEMP:%.*]], align 16 // CHECK-PPC: store ppc_fp128 [[CLD_IMAG]], ptr [[BYVAL_TEMP_IMAGP:%.*]], align 16 -// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @cargl(ptr noundef nonnull byval({ ppc_fp128, ppc_fp128 }) align 16 [[BYVAL_TEMP]]) #[[ATTR3]] +// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @cargl(ptr noundef nonnull byval({ ppc_fp128, ppc_fp128 }) align 16 [[BYVAL_TEMP]]) #[[ATTR4]] // CHECK-PPC: store ppc_fp128 [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 16 // CHECK-PPC: store ppc_fp128 [[MUL_IR:%.*]], ptr [[AGG_RESULT_IMAGP:%.*]], align 16 // // CHECK-ARM-LABEL: define dso_local void @test_cargl( -// CHECK-ARM-SAME: ptr dead_on_unwind noalias writable writeonly sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], [2 x i64] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// CHECK-ARM: [[CALL:%.*]] = tail call double @cargl([2 x i64] noundef [[CLD_COERCE]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-ARM-SAME: ptr dead_on_unwind noalias writable writeonly sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], [2 x i64] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-ARM: [[CALL:%.*]] = tail call double @cargl([2 x i64] noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA3]] // CHECK-ARM: store double [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 8 // CHECK-ARM: store double [[MUL_IR:%.*]], ptr [[AGG_RESULT_IMAGP:%.*]], align 8 // // CHECK-ARM-HF-LABEL: define dso_local { double, double } @test_cargl( -// CHECK-ARM-HF-SAME: { double, double } noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @cargl({ double, double } noundef [[CLD_COERCE]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-ARM-HF-SAME: { double, double } noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @cargl({ double, double } noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-THUMB-LABEL: define { double, double } @test_cargl( -// CHECK-THUMB-SAME: [2 x double] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// CHECK-THUMB: [[CALL:%.*]] = tail call double @cargl([2 x double] noundef [[CLD_COERCE]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-THUMB-SAME: [2 x double] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-THUMB: [[CALL:%.*]] = tail call double @cargl([2 x double] noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-AARCH-LABEL: define dso_local { fp128, fp128 } @test_cargl( -// CHECK-AARCH-SAME: [2 x fp128] noundef alignstack(16) [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @cargl([2 x fp128] noundef alignstack(16) [[CLD_COERCE]]) #[[ATTR2]], !tbaa [[TBAA2]] +// CHECK-AARCH-SAME: [2 x fp128] noundef alignstack(16) [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @cargl([2 x fp128] noundef alignstack(16) [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA2]] // // CHECK-SPIR-LABEL: define dso_local spir_func void @test_cargl( -// CHECK-SPIR-SAME: ptr dead_on_unwind noalias writable writeonly 
sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr noundef readonly byval({ double, double }) align 8 captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-SPIR-SAME: ptr dead_on_unwind noalias writable writeonly sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr noundef readonly byval({ double, double }) align 8 captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-SPIR: [[CLD_REAL:%.*]] = load double, ptr [[CLD]], align 8 // CHECK-SPIR: [[CLD_IMAG:%.*]] = load double, ptr [[CLD_IMAGP:%.*]], align 8 // CHECK-SPIR: store double [[CLD_REAL]], ptr [[BYVAL_TEMP:%.*]], align 8 // CHECK-SPIR: store double [[CLD_IMAG]], ptr [[BYVAL_TEMP_IMAGP:%.*]], align 8 -// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @cargl(ptr noundef nonnull byval({ double, double }) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @cargl(ptr noundef nonnull byval({ double, double }) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // CHECK-SPIR: store double [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 8 // CHECK-SPIR: store double [[MUL_IR:%.*]], ptr [[AGG_RESULT_IMAGP:%.*]], align 8 // // CHECK-MINGW32-LABEL: define dso_local void @test_cargl( -// CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret({ x86_fp80, x86_fp80 }) align 16 captures(none) initializes((0, 10), (16, 26)) [[AGG_RESULT:%.*]], ptr noundef readonly captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret({ x86_fp80, x86_fp80 }) align 16 captures(none) initializes((0, 10), (16, 26)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-MINGW32: [[CLD_REAL:%.*]] = load x86_fp80, ptr [[CLD]], align 16 // CHECK-MINGW32: [[CLD_IMAG:%.*]] = load x86_fp80, ptr [[CLD_IMAGP:%.*]], align 16 // CHECK-MINGW32: store x86_fp80 [[CLD_REAL]], ptr [[BYVAL_TEMP:%.*]], align 16 // CHECK-MINGW32: store x86_fp80 [[CLD_IMAG]], ptr [[BYVAL_TEMP_IMAGP:%.*]], align 16 -// CHECK-MINGW32: call void @cargl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] -// CHECK-MINGW32: [[TMP0:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA3]] +// CHECK-MINGW32: call void @cargl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] +// CHECK-MINGW32: [[TMP0:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA6]] // CHECK-MINGW32: [[CLD_REAL3:%.*]] = load x86_fp80, ptr [[CLD]], align 16 // CHECK-MINGW32: [[CLD_IMAG5:%.*]] = load x86_fp80, ptr [[CLD_IMAGP]], align 16 // CHECK-MINGW32: store x86_fp80 [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 16 @@ -166,33 +166,33 @@ int ilogbl(long double a); // // CHECK-PPC-LABEL: define dso_local i32 @test_ilogb( // CHECK-PPC-SAME: ppc_fp128 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-PPC: [[CALL:%.*]] = tail call i32 @ilogbl(ppc_fp128 noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA2]] +// CHECK-PPC: [[CALL:%.*]] = tail call i32 @ilogbl(ppc_fp128 noundef [[A]]) #[[ATTR4]], !tbaa [[TBAA2]] // // CHECK-ARM-LABEL: define dso_local i32 @test_ilogb( // CHECK-ARM-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-ARM: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-ARM: [[CALL:%.*]] = tail call 
i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-ARM-HF-LABEL: define dso_local i32 @test_ilogb( // CHECK-ARM-HF-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-ARM-HF: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-ARM-HF: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-THUMB-LABEL: define i32 @test_ilogb( // CHECK-THUMB-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-THUMB: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-THUMB: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-AARCH-LABEL: define dso_local i32 @test_ilogb( // CHECK-AARCH-SAME: fp128 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-AARCH: [[CALL:%.*]] = tail call i32 @ilogbl(fp128 noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA2]] +// CHECK-AARCH: [[CALL:%.*]] = tail call i32 @ilogbl(fp128 noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA2]] // // CHECK-SPIR-LABEL: define dso_local spir_func i32 @test_ilogb( // CHECK-SPIR-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA2]] +// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func i32 @ilogbl(double noundef [[A]]) #[[ATTR4]], !tbaa [[TBAA2]] // // CHECK-MINGW32-LABEL: define dso_local i32 @test_ilogb( -// CHECK-MINGW32-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: [[CALL:%.*]] = call i32 @ilogbl(ptr noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] +// CHECK-MINGW32-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: [[CALL:%.*]] = call i32 @ilogbl(ptr dead_on_return noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] // int test_ilogb(long double a) { return ilogbl(a); @@ -243,8 +243,8 @@ int test_ilogb(long double a) { // CHECK-SPIR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK-SPIR: [[META5]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-MINGW32: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} -// CHECK-MINGW32: [[META4]] = !{!"long double", [[META5:![0-9]+]], i64 0} -// CHECK-MINGW32: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK-MINGW32: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK-MINGW32: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-MINGW32: [[META7]] = !{!"long double", [[META8:![0-9]+]], i64 0} +// CHECK-MINGW32: [[META8]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK-MINGW32: [[META9]] = !{!"Simple C/C++ TBAA"} //. 
diff --git a/clang/test/CodeGen/mingw-long-double.c b/clang/test/CodeGen/mingw-long-double.c index 0fc8f01509682..b98929701bc62 100644 --- a/clang/test/CodeGen/mingw-long-double.c +++ b/clang/test/CodeGen/mingw-long-double.c @@ -29,15 +29,15 @@ long double TestLD(long double x) { return x * x; } // GNU32: define dso_local x86_fp80 @TestLD(x86_fp80 noundef %x) -// GNU64: define dso_local void @TestLD(ptr dead_on_unwind noalias writable sret(x86_fp80) align 16 %agg.result, ptr noundef %0) +// GNU64: define dso_local void @TestLD(ptr dead_on_unwind noalias writable sret(x86_fp80) align 16 %agg.result, ptr dead_on_return noundef %0) // MSC64: define dso_local double @TestLD(double noundef %x) long double _Complex TestLDC(long double _Complex x) { return x * x; } // GNU32: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ x86_fp80, x86_fp80 }) align 4 %agg.result, ptr noundef byval({ x86_fp80, x86_fp80 }) align 4 %x) -// GNU64: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ x86_fp80, x86_fp80 }) align 16 %agg.result, ptr noundef %x) -// MSC64: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ double, double }) align 8 %agg.result, ptr noundef %x) +// GNU64: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ x86_fp80, x86_fp80 }) align 16 %agg.result, ptr dead_on_return noundef %x) +// MSC64: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ double, double }) align 8 %agg.result, ptr dead_on_return noundef %x) // GNU32: declare dso_local void @__mulxc3 // GNU64: declare dso_local void @__mulxc3 diff --git a/clang/test/CodeGen/ms_abi.c b/clang/test/CodeGen/ms_abi.c index 528e546f315d5..5d58c9816da78 100644 --- a/clang/test/CodeGen/ms_abi.c +++ b/clang/test/CodeGen/ms_abi.c @@ -142,7 +142,7 @@ struct i128 { }; __attribute__((ms_abi)) struct i128 f7(struct i128 a) { - // WIN64: define dso_local void @f7(ptr dead_on_unwind noalias writable sret(%struct.i128) align 8 %agg.result, ptr noundef %a) - // FREEBSD: define{{.*}} win64cc void @f7(ptr dead_on_unwind noalias writable sret(%struct.i128) align 8 %agg.result, ptr noundef %a) + // WIN64: define dso_local void @f7(ptr dead_on_unwind noalias writable sret(%struct.i128) align 8 %agg.result, ptr dead_on_return noundef %a) + // FREEBSD: define{{.*}} win64cc void @f7(ptr dead_on_unwind noalias writable sret(%struct.i128) align 8 %agg.result, ptr dead_on_return noundef %a) return a; } diff --git a/clang/test/CodeGen/new-pass-manager-opt-bisect.c b/clang/test/CodeGen/new-pass-manager-opt-bisect.c index 91a0adf252bb5..5d5fdd473422a 100644 --- a/clang/test/CodeGen/new-pass-manager-opt-bisect.c +++ b/clang/test/CodeGen/new-pass-manager-opt-bisect.c @@ -7,6 +7,6 @@ // CHECK: BISECT: running pass (1) // CHECK-NOT: BISECT: running pass (1) // Make sure that legacy pass manager is running -// CHECK: Instruction Selection +// CHECK: -isel int func(int a) { return a; } diff --git a/clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp b/clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp new file mode 100644 index 0000000000000..0b62f24177bbd --- /dev/null +++ b/clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp @@ -0,0 +1,5 @@ +// RUN: %clangxx -g -fsanitize=null -fsanitize-trap=all -fsanitize-annotate-debug-info=all -O2 -std=c++17 -c -o /dev/null %s + +struct foo { + foo(int, long, const int & = int()); +} foo(0, 0); diff --git a/clang/test/CodeGen/pass-by-value-noalias.c b/clang/test/CodeGen/pass-by-value-noalias.c 
index bc35d13c4df6a..e673ceb80bebe 100644 --- a/clang/test/CodeGen/pass-by-value-noalias.c +++ b/clang/test/CodeGen/pass-by-value-noalias.c @@ -11,6 +11,6 @@ struct Foo { int f; }; -// WITH_NOALIAS: define{{.*}} void @take(ptr noalias noundef %arg) -// NO_NOALIAS: define{{.*}} void @take(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @take(ptr dead_on_return noalias noundef %arg) +// NO_NOALIAS: define{{.*}} void @take(ptr dead_on_return noundef %arg) void take(struct Foo arg) {} diff --git a/clang/test/CodeGen/ptrauth-in-c-struct.c b/clang/test/CodeGen/ptrauth-in-c-struct.c index 2aec31ec3baf9..c74be17b4c837 100644 --- a/clang/test/CodeGen/ptrauth-in-c-struct.c +++ b/clang/test/CodeGen/ptrauth-in-c-struct.c @@ -115,7 +115,7 @@ void test_move_assignment_SA(SA *p) { *p = getSA(); } -// CHECK: define void @test_parameter_SA(ptr noundef %{{.*}}) +// CHECK: define void @test_parameter_SA(ptr dead_on_return noundef %{{.*}}) // CHECK-NOT: call // CHECK: ret void @@ -128,7 +128,7 @@ void test_parameter_SA(SA a) { // CHECK: store ptr %[[A]], ptr %[[A_ADDR]], align 8 // CHECK: %[[V0:.*]] = load ptr, ptr %[[A_ADDR]], align 8 // CHECK: call void @__copy_constructor_8_8_t0w4_pa1_50_8(ptr %[[AGG_TMP]], ptr %[[V0]]) -// CHECK: call void @calleeSA(ptr noundef %[[AGG_TMP]]) +// CHECK: call void @calleeSA(ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK-NOT: call // CHECK: ret void diff --git a/clang/test/CodeGen/regcall.c b/clang/test/CodeGen/regcall.c index f10da87353fa1..d4b9f00d54d41 100644 --- a/clang/test/CodeGen/regcall.c +++ b/clang/test/CodeGen/regcall.c @@ -28,7 +28,7 @@ struct Large { int a[5]; }; void __regcall v4(int a, struct Large b, int c) {} // Win32: define dso_local x86_regcallcc void @__regcall3__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 inreg noundef %c) // Lin32: define dso_local x86_regcallcc void @__regcall3__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 noundef %c) -// Win64: define dso_local x86_regcallcc void @__regcall3__v4(i32 noundef %a, ptr noundef %b, i32 noundef %c) +// Win64: define dso_local x86_regcallcc void @__regcall3__v4(i32 noundef %a, ptr dead_on_return noundef %b, i32 noundef %c) // Lin64: define dso_local x86_regcallcc void @__regcall3__v4(i32 noundef %a, [5 x i32] %b.coerce, i32 noundef %c) void __regcall v5(long long a, int b, int c) {} @@ -47,7 +47,7 @@ void __regcall hfa1(int a, struct HFA4 b, int c) {} // indirectly. Additional vector arguments can consume the rest of the SSE // registers. void __regcall hfa2(struct HFA4 a, struct HFA4 b, double c) {} -// X86: define dso_local x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, ptr inreg noundef %0) +// X86: define dso_local x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, ptr dead_on_return inreg noundef %0) // X64: define dso_local x86_regcallcc void @__regcall3__hfa2(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double noundef %c) // Ensure that we pass builtin types directly while counting them against the @@ -61,7 +61,7 @@ void __regcall hfa3(double a, double b, double c, double d, double e, struct HFA // handling to ensure alignment. 
void __regcall hfa4(struct HFA5 a) {} // X32: define dso_local x86_regcallcc void @__regcall3__hfa4(ptr noundef byval(%struct.HFA5) align 4 %{{.*}}) -// Win64: define dso_local x86_regcallcc void @__regcall3__hfa4(ptr noundef %a) +// Win64: define dso_local x86_regcallcc void @__regcall3__hfa4(ptr dead_on_return noundef %a) // Lin64: define dso_local x86_regcallcc void @__regcall3__hfa4(double %a.coerce0, double %a.coerce1, double %a.coerce2, double %a.coerce3, double %a.coerce4) // Return HFAs of 4 or fewer elements in registers. @@ -79,7 +79,7 @@ void __regcall hva1(int a, struct HVA4 b, int c) {} // X64: define dso_local x86_regcallcc void @__regcall3__hva1(i32 noundef %a, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 noundef %c) void __regcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {} -// X86: define dso_local x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %0) +// X86: define dso_local x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr dead_on_return inreg noundef %0) // X64: define dso_local x86_regcallcc void @__regcall3__hva2(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> noundef %c) void __regcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {} @@ -95,6 +95,6 @@ void __regcall odd_size_hva(struct OddSizeHVA a) {} struct HFA6 { __m128 f[4]; }; struct HFA6 __regcall ret_reg_reused(struct HFA6 a, struct HFA6 b, struct HFA6 c, struct HFA6 d){ struct HFA6 h; return h;} -// X86: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %c, ptr inreg noundef %d) +// X86: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr dead_on_return inreg noundef %c, ptr dead_on_return inreg noundef %d) // Win64: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c.0, <4 x float> %c.1, <4 x float> %c.2, <4 x float> %c.3, <4 x float> %d.0, <4 x float> %d.1, <4 x float> %d.2, <4 x float> %d.3) // Lin64: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused([4 x <4 x float>] %a.coerce, [4 x <4 x float>] %b.coerce, [4 x <4 x float>] %c.coerce, [4 x <4 x float>] %d.coerce) diff --git a/clang/test/CodeGen/regcall2.c b/clang/test/CodeGen/regcall2.c index c88d4e485b104..42619369677b2 100644 --- a/clang/test/CodeGen/regcall2.c +++ b/clang/test/CodeGen/regcall2.c @@ -20,7 +20,7 @@ double __regcall bar(__sVector a) { // FIXME: Do we need to change for Windows? 
// Win: define dso_local x86_regcallcc void @__regcall3__foo(ptr dead_on_unwind noalias writable sret(%struct.__sVector) align 64 %agg.result, i32 noundef %a) #0 -// Win: define dso_local x86_regcallcc double @__regcall3__bar(ptr noundef %a) #0 +// Win: define dso_local x86_regcallcc double @__regcall3__bar(ptr dead_on_return noundef %a) #0 // Win: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" } // Lin: define dso_local x86_regcallcc %struct.__sVector @__regcall3__foo(i32 noundef %a) #0 diff --git a/clang/test/CodeGen/regcall4.c b/clang/test/CodeGen/regcall4.c index 5fbe77fbc7d76..d5fe5d88a0e8c 100644 --- a/clang/test/CodeGen/regcall4.c +++ b/clang/test/CodeGen/regcall4.c @@ -28,7 +28,7 @@ struct Large { int a[5]; }; void __regcall v4(int a, struct Large b, int c) {} // Win32: define dso_local x86_regcallcc void @__regcall4__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 inreg noundef %c) // Lin32: define dso_local x86_regcallcc void @__regcall4__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 noundef %c) -// Win64: define dso_local x86_regcallcc void @__regcall4__v4(i32 noundef %a, ptr noundef %b, i32 noundef %c) +// Win64: define dso_local x86_regcallcc void @__regcall4__v4(i32 noundef %a, ptr dead_on_return noundef %b, i32 noundef %c) // Lin64: define dso_local x86_regcallcc void @__regcall4__v4(i32 noundef %a, [5 x i32] %b.coerce, i32 noundef %c) void __regcall v5(long long a, int b, int c) {} @@ -47,7 +47,7 @@ void __regcall hfa1(int a, struct HFA4 b, int c) {} // indirectly. Additional vector arguments can consume the rest of the SSE // registers. void __regcall hfa2(struct HFA4 a, struct HFA4 b, double c) {} -// X86: define dso_local x86_regcallcc void @__regcall4__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, ptr inreg noundef %0) +// X86: define dso_local x86_regcallcc void @__regcall4__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, ptr dead_on_return inreg noundef %0) // X64: define dso_local x86_regcallcc void @__regcall4__hfa2(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double noundef %c) // Ensure that we pass builtin types directly while counting them against the @@ -61,7 +61,7 @@ void __regcall hfa3(double a, double b, double c, double d, double e, struct HFA // handling to ensure alignment. void __regcall hfa4(struct HFA5 a) {} // X32: define dso_local x86_regcallcc void @__regcall4__hfa4(ptr noundef byval(%struct.HFA5) align 4 %{{.*}}) -// Win64: define dso_local x86_regcallcc void @__regcall4__hfa4(ptr noundef %a) +// Win64: define dso_local x86_regcallcc void @__regcall4__hfa4(ptr dead_on_return noundef %a) // Lin64: define dso_local x86_regcallcc void @__regcall4__hfa4(double %a.coerce0, double %a.coerce1, double %a.coerce2, double %a.coerce3, double %a.coerce4) // Return HFAs of 4 or fewer elements in registers. 
@@ -79,7 +79,7 @@ void __regcall hva1(int a, struct HVA4 b, int c) {} // X64: define dso_local x86_regcallcc void @__regcall4__hva1(i32 noundef %a, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 noundef %c) void __regcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {} -// X86: define dso_local x86_regcallcc void @__regcall4__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %0) +// X86: define dso_local x86_regcallcc void @__regcall4__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr dead_on_return inreg noundef %0) // X64: define dso_local x86_regcallcc void @__regcall4__hva2(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> noundef %c) void __regcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {} @@ -95,6 +95,6 @@ void __regcall odd_size_hva(struct OddSizeHVA a) {} struct HFA6 { __m128 f[4]; }; struct HFA6 __regcall ret_reg_reused(struct HFA6 a, struct HFA6 b, struct HFA6 c, struct HFA6 d){ struct HFA6 h; return h;} -// X86: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %c, ptr inreg noundef %d) +// X86: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr dead_on_return inreg noundef %c, ptr dead_on_return inreg noundef %d) // Win64: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c.0, <4 x float> %c.1, <4 x float> %c.2, <4 x float> %c.3, <4 x float> %d.0, <4 x float> %d.1, <4 x float> %d.2, <4 x float> %d.3) // Lin64: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused([4 x <4 x float>] %a.coerce, [4 x <4 x float>] %b.coerce, [4 x <4 x float>] %c.coerce, [4 x <4 x float>] %d.coerce) diff --git a/clang/test/CodeGen/sparcv9-abi.c b/clang/test/CodeGen/sparcv9-abi.c index 616e24e7c519d..5a3d64fd37889 100644 --- a/clang/test/CodeGen/sparcv9-abi.c +++ b/clang/test/CodeGen/sparcv9-abi.c @@ -80,7 +80,7 @@ struct medium { int *c, *d; }; -// CHECK-LABEL: define{{.*}} %struct.medium @f_medium(ptr noundef %x) +// CHECK-LABEL: define{{.*}} %struct.medium @f_medium(ptr dead_on_return noundef %x) struct medium f_medium(struct medium x) { x.a += *x.b; x.b = 0; @@ -94,7 +94,7 @@ struct large { int x; }; -// CHECK-LABEL: define{{.*}} void @f_large(ptr dead_on_unwind noalias writable sret(%struct.large) align 8 %agg.result, ptr noundef %x) +// CHECK-LABEL: define{{.*}} void @f_large(ptr dead_on_unwind noalias writable sret(%struct.large) align 8 %agg.result, ptr dead_on_return noundef %x) struct large f_large(struct large x) { x.a += *x.b; x.b = 0; diff --git a/clang/test/CodeGen/vectorcall.c b/clang/test/CodeGen/vectorcall.c index cab7fc0972d7b..09b3310c7c4c8 100644 --- a/clang/test/CodeGen/vectorcall.c +++ b/clang/test/CodeGen/vectorcall.c @@ -17,7 
+17,7 @@ void __vectorcall v3(int a, struct Small b, int c) {} struct Large { int a[5]; }; void __vectorcall v4(int a, struct Large b, int c) {} // X86: define dso_local x86_vectorcallcc void @"\01v4@@28"(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 inreg noundef %c) -// X64: define dso_local x86_vectorcallcc void @"\01v4@@40"(i32 noundef %a, ptr noundef %b, i32 noundef %c) +// X64: define dso_local x86_vectorcallcc void @"\01v4@@40"(i32 noundef %a, ptr dead_on_return noundef %b, i32 noundef %c) void __vectorcall v5(long long a, int b, int c) {} // X86: define dso_local x86_vectorcallcc void @"\01v5@@16"(i64 noundef %a, i32 inreg noundef %b, i32 inreg noundef %c) @@ -35,21 +35,21 @@ void __vectorcall hfa1(int a, struct HFA4 b, int c) {} // indirectly. Additional vector arguments can consume the rest of the SSE // registers. void __vectorcall hfa2(struct HFA4 a, struct HFA4 b, double c) {} -// X86: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, ptr inreg noundef %b, double inreg noundef %c) -// X64: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, ptr noundef %b, double noundef %c) +// X86: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, ptr dead_on_return inreg noundef %b, double inreg noundef %c) +// X64: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, ptr dead_on_return noundef %b, double noundef %c) // Ensure that we pass builtin types directly while counting them against the // SSE register usage. void __vectorcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {} -// X86: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double inreg noundef %a, double inreg noundef %b, double inreg noundef %c, double inreg noundef %d, double inreg noundef %e, ptr inreg noundef %f) -// X64: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, ptr noundef %f) +// X86: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double inreg noundef %a, double inreg noundef %b, double inreg noundef %c, double inreg noundef %d, double inreg noundef %e, ptr dead_on_return inreg noundef %f) +// X64: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, ptr dead_on_return noundef %f) // Aggregates with more than four elements are not HFAs and are passed byval. // Because they are not classified as homogeneous, they don't get special // handling to ensure alignment. void __vectorcall hfa4(struct HFA5 a) {} // X86: define dso_local x86_vectorcallcc void @"\01hfa4@@40"(ptr noundef byval(%struct.HFA5) align 4 %0) -// X64: define dso_local x86_vectorcallcc void @"\01hfa4@@40"(ptr noundef %a) +// X64: define dso_local x86_vectorcallcc void @"\01hfa4@@40"(ptr dead_on_return noundef %a) // Return HFAs of 4 or fewer elements in registers. 
static struct HFA2 g_hfa2; @@ -68,26 +68,26 @@ v4f32 __vectorcall hva1(int a, struct HVA4 b, int c) {return b.w;} // X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva1@@80"(i32 noundef %a, %struct.HVA4 inreg %b.coerce, i32 noundef %c) v4f32 __vectorcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {return c;} -// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, ptr inreg noundef %b, <4 x float> inreg noundef %c) -// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, ptr noundef %b, <4 x float> noundef %c) +// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return inreg noundef %b, <4 x float> inreg noundef %c) +// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return noundef %b, <4 x float> noundef %c) v4f32 __vectorcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {return f.x;} -// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> inreg noundef %a, <4 x float> inreg noundef %b, <4 x float> inreg noundef %c, <4 x float> inreg noundef %d, <4 x float> inreg noundef %e, ptr inreg noundef %f) -// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c, <4 x float> noundef %d, <4 x float> noundef %e, ptr noundef %f) +// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> inreg noundef %a, <4 x float> inreg noundef %b, <4 x float> inreg noundef %c, <4 x float> inreg noundef %d, <4 x float> inreg noundef %e, ptr dead_on_return inreg noundef %f) +// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c, <4 x float> noundef %d, <4 x float> noundef %e, ptr dead_on_return noundef %f) // vector types have higher priority then HVA structures, So vector types are allocated first // and HVAs are allocated if enough registers are available v4f32 __vectorcall hva4(struct HVA4 a, struct HVA2 b, v4f32 c) {return b.y;} -// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, ptr inreg noundef %b, <4 x float> inreg noundef %c) -// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, ptr noundef %b, <4 x float> noundef %c) +// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return inreg noundef %b, <4 x float> inreg noundef %c) +// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return noundef %b, <4 x float> noundef %c) v4f32 __vectorcall hva5(struct HVA3 a, struct HVA3 b, v4f32 c, struct HVA2 d) {return d.y;} -// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, ptr inreg noundef %b, <4 x float> inreg noundef %c, %struct.HVA2 inreg %d.coerce) -// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, ptr noundef %b, <4 x float> noundef %c, %struct.HVA2 inreg %d.coerce) +// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, ptr dead_on_return inreg noundef %b, <4 x float> inreg noundef %c, %struct.HVA2 inreg %d.coerce) +// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg 
%a.coerce, ptr dead_on_return noundef %b, <4 x float> noundef %c, %struct.HVA2 inreg %d.coerce) struct HVA4 __vectorcall hva6(struct HVA4 a, struct HVA4 b) { return b;} -// X86: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, ptr inreg noundef %b) -// X64: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, ptr noundef %b) +// X86: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return inreg noundef %b) +// X64: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return noundef %b) struct HVA5 __vectorcall hva7(void) {struct HVA5 a = {}; return a;} // X86: define dso_local x86_vectorcallcc void @"\01hva7@@0"(ptr dead_on_unwind noalias writable sret(%struct.HVA5) align 16 %agg.result) @@ -108,8 +108,8 @@ void __vectorcall odd_size_hva(struct OddSizeHVA a) {} // consider 'p7' as a register. Instead p5 gets put into the register on the second pass. // x86 should pass p2, p6 and p7 in registers, then p1 in the second pass. struct HFA2 __vectorcall AddParticles(struct HFA2 p1, float p2, struct HFA4 p3, int p4, struct HFA2 p5, float p6, float p7, int p8){ return p1;} -// X86: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@84"(%struct.HFA2 inreg %p1.coerce, float inreg noundef %p2, ptr inreg noundef %p3, i32 inreg noundef %p4, ptr noundef %p5, float inreg noundef %p6, float inreg noundef %p7, i32 noundef %p8) -// X64: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@104"(%struct.HFA2 inreg %p1.coerce, float noundef %p2, ptr noundef %p3, i32 noundef %p4, %struct.HFA2 inreg %p5.coerce, float noundef %p6, float noundef %p7, i32 noundef %p8) +// X86: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@84"(%struct.HFA2 inreg %p1.coerce, float inreg noundef %p2, ptr dead_on_return inreg noundef %p3, i32 inreg noundef %p4, ptr dead_on_return noundef %p5, float inreg noundef %p6, float inreg noundef %p7, i32 noundef %p8) +// X64: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@104"(%struct.HFA2 inreg %p1.coerce, float noundef %p2, ptr dead_on_return noundef %p3, i32 noundef %p4, %struct.HFA2 inreg %p5.coerce, float noundef %p6, float noundef %p7, i32 noundef %p8) // Vectorcall in both architectures allows passing of an HVA as long as there is room, // even if it is not one of the first 6 arguments. First pass puts p4 into a @@ -117,8 +117,8 @@ struct HFA2 __vectorcall AddParticles(struct HFA2 p1, float p2, struct HFA4 p3, // in a register, does NOT put p7 in a register (since there's no room), then puts // p8 in a register. 
void __vectorcall HVAAnywhere(struct HFA2 p1, int p2, int p3, float p4, int p5, int p6, struct HFA4 p7, struct HFA2 p8, float p9){} -// X86: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@88"(%struct.HFA2 inreg %p1.coerce, i32 inreg noundef %p2, i32 inreg noundef %p3, float inreg noundef %p4, i32 noundef %p5, i32 noundef %p6, ptr noundef %p7, %struct.HFA2 inreg %p8.coerce, float inreg noundef %p9) -// X64: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@112"(%struct.HFA2 inreg %p1.coerce, i32 noundef %p2, i32 noundef %p3, float noundef %p4, i32 noundef %p5, i32 noundef %p6, ptr noundef %p7, %struct.HFA2 inreg %p8.coerce, float noundef %p9) +// X86: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@88"(%struct.HFA2 inreg %p1.coerce, i32 inreg noundef %p2, i32 inreg noundef %p3, float inreg noundef %p4, i32 noundef %p5, i32 noundef %p6, ptr dead_on_return noundef %p7, %struct.HFA2 inreg %p8.coerce, float inreg noundef %p9) +// X64: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@112"(%struct.HFA2 inreg %p1.coerce, i32 noundef %p2, i32 noundef %p3, float noundef %p4, i32 noundef %p5, i32 noundef %p6, ptr dead_on_return noundef %p7, %struct.HFA2 inreg %p8.coerce, float noundef %p9) #ifndef __x86_64__ // This covers the three ways XMM values can be passed on 32-bit x86: @@ -137,9 +137,9 @@ void __vectorcall vectorcall_indirect_vec( // X86-SAME: double inreg noundef %xmm3, // X86-SAME: double inreg noundef %xmm4, // X86-SAME: <4 x float> inreg noundef %xmm5, -// X86-SAME: ptr inreg noundef %0, +// X86-SAME: ptr dead_on_return inreg noundef %0, // X86-SAME: i32 inreg noundef %edx, -// X86-SAME: ptr noundef %1) +// X86-SAME: ptr dead_on_return noundef %1) void __vectorcall vectorcall_indirect_fp( double xmm0, double xmm1, double xmm2, double xmm3, double xmm4, @@ -153,7 +153,7 @@ void __vectorcall vectorcall_indirect_fp( // X86-SAME: double inreg noundef %xmm3, // X86-SAME: double inreg noundef %xmm4, // X86-SAME: <4 x float> inreg noundef %xmm5, -// X86-SAME: ptr inreg noundef %0, +// X86-SAME: ptr dead_on_return inreg noundef %0, // X86-SAME: i32 inreg noundef %edx, // X86-SAME: double noundef %mem) #endif diff --git a/clang/test/CodeGen/win-fp128.c b/clang/test/CodeGen/win-fp128.c index 328a7aaa7df57..8d223741bc93e 100644 --- a/clang/test/CodeGen/win-fp128.c +++ b/clang/test/CodeGen/win-fp128.c @@ -6,7 +6,7 @@ __float128 fp128_ret(void) { return 0; } // CHECK-GNU64: define dso_local <2 x i64> @fp128_ret() __float128 fp128_args(__float128 a, __float128 b) { return a * b; } -// CHECK-GNU64: define dso_local <2 x i64> @fp128_args(ptr noundef %0, ptr noundef %1) +// CHECK-GNU64: define dso_local <2 x i64> @fp128_args(ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) void fp128_vararg(int a, ...) { // CHECK-GNU64-LABEL: define dso_local void @fp128_vararg diff --git a/clang/test/CodeGen/win64-i128.c b/clang/test/CodeGen/win64-i128.c index e10b2be0530eb..2d83889d8f89b 100644 --- a/clang/test/CodeGen/win64-i128.c +++ b/clang/test/CodeGen/win64-i128.c @@ -12,8 +12,8 @@ int128_t foo(void) { return 0; } int128_t bar(int128_t a, int128_t b) { return a * b; } -// GNU64: define dso_local <2 x i64> @bar(ptr noundef %0, ptr noundef %1) -// MSC64: define dso_local <2 x i64> @bar(ptr noundef %0, ptr noundef %1) +// GNU64: define dso_local <2 x i64> @bar(ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) +// MSC64: define dso_local <2 x i64> @bar(ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) void vararg(int a, ...) 
{ // GNU64-LABEL: define{{.*}} void @vararg diff --git a/clang/test/CodeGen/windows-swiftcall.c b/clang/test/CodeGen/windows-swiftcall.c index 41569c2606622..8716f25b9ddfb 100644 --- a/clang/test/CodeGen/windows-swiftcall.c +++ b/clang/test/CodeGen/windows-swiftcall.c @@ -219,7 +219,7 @@ TEST(struct_big_1) // CHECK-LABEL: define {{.*}} void @return_struct_big_1({{.*}} dead_on_unwind noalias writable sret // Should not be byval. -// CHECK-LABEL: define {{.*}} void @take_struct_big_1(ptr noundef{{( %.*)?}}) +// CHECK-LABEL: define {{.*}} void @take_struct_big_1(ptr dead_on_return noundef{{( %.*)?}}) /*****************************************************************************/ /********************************* TYPE MERGING ******************************/ diff --git a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp index 152be26948f28..1709c88563267 100644 --- a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp +++ b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp @@ -576,7 +576,7 @@ void f(__clang_svmfloat8x4_t, __clang_svmfloat8x4_t); // CHECK-NEXT: [[COERCE74_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE74_TUPLE]], 2 // CHECK-NEXT: [[COERCE74_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE74_TUPLE]], 3 // CHECK-NEXT: store { , , , } zeroinitializer, ptr [[BYVAL_TEMP]], align 2 -// CHECK-NEXT: call void @_Z1f10svboolx4_tS_( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], ptr noundef [[BYVAL_TEMP]]) +// CHECK-NEXT: call void @_Z1f10svboolx4_tS_( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], ptr dead_on_return noundef [[BYVAL_TEMP]]) // CHECK-NEXT: store { , } zeroinitializer, ptr [[COERCE75]], align 16 // CHECK-NEXT: [[COERCE75_TUPLE:%.*]] = load { , }, ptr [[COERCE75]], align 16 // CHECK-NEXT: [[COERCE75_EXTRACT0:%.*]] = extractvalue { , } [[COERCE75_TUPLE]], 0 @@ -1125,7 +1125,7 @@ void f(__clang_svmfloat8x4_t, __clang_svmfloat8x4_t); // COMPAT_17-NEXT: [[COERCE74_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE74_TUPLE]], 2 // COMPAT_17-NEXT: [[COERCE74_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE74_TUPLE]], 3 // COMPAT_17-NEXT: store { , , , } zeroinitializer, ptr [[BYVAL_TEMP]], align 2 -// COMPAT_17-NEXT: call void @_Z1f10svboolx4_t10svboolx4_t( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], ptr noundef [[BYVAL_TEMP]]) +// COMPAT_17-NEXT: call void @_Z1f10svboolx4_t10svboolx4_t( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], ptr dead_on_return noundef [[BYVAL_TEMP]]) // COMPAT_17-NEXT: store { , } zeroinitializer, ptr [[COERCE75]], align 16 // COMPAT_17-NEXT: [[COERCE75_TUPLE:%.*]] = load { , }, ptr [[COERCE75]], align 16 // COMPAT_17-NEXT: [[COERCE75_EXTRACT0:%.*]] = extractvalue { , } [[COERCE75_TUPLE]], 0 diff --git a/clang/test/CodeGenCXX/arm-cc.cpp b/clang/test/CodeGenCXX/arm-cc.cpp index 68e1b7e4e1e46..939615fcc69e0 100644 --- a/clang/test/CodeGenCXX/arm-cc.cpp +++ b/clang/test/CodeGenCXX/arm-cc.cpp @@ -17,4 +17,4 @@ void baz() { } // CHECK: declare void @_Z3fooPv(ptr dead_on_unwind writable sret(%class.SMLoc) align 4, ptr noundef) -// CHECK: declare void @_Z3zed5SMLoc(ptr noundef) +// CHECK: declare void @_Z3zed5SMLoc(ptr dead_on_return noundef) diff --git a/clang/test/CodeGenCXX/attr-target-mv-inalloca.cpp b/clang/test/CodeGenCXX/attr-target-mv-inalloca.cpp index c341bd2b855ff..f2d602b3b523e 100644 --- 
a/clang/test/CodeGenCXX/attr-target-mv-inalloca.cpp +++ b/clang/test/CodeGenCXX/attr-target-mv-inalloca.cpp @@ -55,18 +55,18 @@ void usage() { // WINDOWS-NEXT: ret i32 %[[RET]] -// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z"(ptr noundef %[[O:[0-9a-zA-Z]+]]) +// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z"(ptr dead_on_return noundef %[[O:[0-9a-zA-Z]+]]) // WINDOWS64: %[[X:[0-9a-zA-Z]+]] = getelementptr inbounds nuw %struct.Foo, ptr %[[O]], i32 0, i32 0 // WINDOWS64: %[[LOAD:[0-9a-zA-Z]+]] = load i32, ptr %[[X]] // WINDOWS64: ret i32 %[[LOAD]] -// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z.sse4.2"(ptr noundef %[[O:[0-9a-zA-Z]+]]) +// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z.sse4.2"(ptr dead_on_return noundef %[[O:[0-9a-zA-Z]+]]) // WINDOWS64: %[[X:[0-9a-zA-Z]+]] = getelementptr inbounds nuw %struct.Foo, ptr %[[O]], i32 0, i32 0 // WINDOWS64: %[[LOAD:[0-9a-zA-Z]+]] = load i32, ptr %[[X]] // WINDOWS64: %[[ADD:[0-9a-zA-Z]+]] = add nsw i32 %[[LOAD]], 1 // WINDOWS64: ret i32 %[[ADD]] -// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z.arch_ivybridge"(ptr noundef %[[O:[0-9a-zA-Z]+]]) +// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z.arch_ivybridge"(ptr dead_on_return noundef %[[O:[0-9a-zA-Z]+]]) // WINDOWS64: %[[X:[0-9a-zA-Z]+]] = getelementptr inbounds nuw %struct.Foo, ptr %[[O]], i32 0, i32 0 // WINDOWS64: %[[LOAD:[0-9a-zA-Z]+]] = load i32, ptr %[[X]] // WINDOWS64: %[[ADD:[0-9a-zA-Z]+]] = add nsw i32 %[[LOAD]], 2 @@ -75,7 +75,7 @@ void usage() { // WINDOWS64: define dso_local void @"?usage@@YAXXZ"() // WINDOWS64: %[[F:[0-9a-zA-Z]+]] = alloca %struct.Foo // WINDOWS64: %[[ARG:[0-9a-zA-Z.]+]] = alloca %struct.Foo -// WINDOWS64: %[[CALL:[0-9a-zA-Z]+]] = call noundef i32 @"?bar@@YAHUFoo@@@Z.resolver"(ptr noundef %[[ARG]]) +// WINDOWS64: %[[CALL:[0-9a-zA-Z]+]] = call noundef i32 @"?bar@@YAHUFoo@@@Z.resolver"(ptr dead_on_return noundef %[[ARG]]) // WINDOWS64: define weak_odr dso_local i32 @"?bar@@YAHUFoo@@@Z.resolver"(ptr %0) // WINDOWS64: %[[RET:[0-9a-zA-Z]+]] = musttail call i32 @"?bar@@YAHUFoo@@@Z.arch_ivybridge"(ptr %0) diff --git a/clang/test/CodeGenCXX/copy-initialization.cpp b/clang/test/CodeGenCXX/copy-initialization.cpp index aa0c6395f158d..4e6194cc040c2 100644 --- a/clang/test/CodeGenCXX/copy-initialization.cpp +++ b/clang/test/CodeGenCXX/copy-initialization.cpp @@ -12,7 +12,7 @@ struct Bar { void f(Foo); -// CHECK-LABEL: define{{.*}} void @_Z1g3Foo(ptr noundef %foo) +// CHECK-LABEL: define{{.*}} void @_Z1g3Foo(ptr dead_on_return noundef %foo) void g(Foo foo) { // CHECK: call void @_ZN3BarC1Ev // CHECK: @_ZNK3BarcvRK3FooEv diff --git a/clang/test/CodeGenCXX/debug-info.cpp b/clang/test/CodeGenCXX/debug-info.cpp index 8594a897ef7c0..9cf26ba83ba3e 100644 --- a/clang/test/CodeGenCXX/debug-info.cpp +++ b/clang/test/CodeGenCXX/debug-info.cpp @@ -4,7 +4,7 @@ // CHECK: @_ZN6pr96081xE ={{.*}} global ptr null, align 8, !dbg [[X:![0-9]+]] // CHECK: define{{.*}} void @_ZN7pr147634funcENS_3fooE -// CHECK-SAME: ptr noundef [[param:%.*]]) +// CHECK-SAME: ptr dead_on_return noundef [[param:%.*]]) // CHECK-NEXT: entry: // CHECK-NEXT: alloca ptr, align 8 // CHECK-NEXT: [[param_addr_storage:%.*]] = alloca ptr, align 8 diff --git a/clang/test/CodeGenCXX/empty-nontrivially-copyable.cpp b/clang/test/CodeGenCXX/empty-nontrivially-copyable.cpp index c7d3a017414ef..c8f5a0f7c2ea6 100644 --- a/clang/test/CodeGenCXX/empty-nontrivially-copyable.cpp +++ b/clang/test/CodeGenCXX/empty-nontrivially-copyable.cpp @@ -13,7 +13,7 @@ 
struct Empty { }; bool foo(Empty e) { -// CHECK: @_Z3foo5Empty(ptr noundef %e) +// CHECK: @_Z3foo5Empty(ptr dead_on_return noundef %e) // CHECK: call {{.*}} @_ZN5Empty5checkEv(ptr {{[^,]*}} %e) return e.check(); } @@ -21,6 +21,6 @@ bool foo(Empty e) { void caller(Empty &e) { // CHECK: @_Z6callerR5Empty(ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %e) // CHECK: call {{.*}} @_ZN5EmptyC1ERKS_(ptr {{[^,]*}} [[NEWTMP:%.*]], ptr -// CHECK: call {{.*}} @_Z3foo5Empty(ptr noundef [[NEWTMP]]) +// CHECK: call {{.*}} @_Z3foo5Empty(ptr dead_on_return noundef [[NEWTMP]]) foo(e); } diff --git a/clang/test/CodeGenCXX/fastcall.cpp b/clang/test/CodeGenCXX/fastcall.cpp index 4c94c1623ee16..405917f7a14bb 100644 --- a/clang/test/CodeGenCXX/fastcall.cpp +++ b/clang/test/CodeGenCXX/fastcall.cpp @@ -15,6 +15,6 @@ struct S1 { void __attribute__((fastcall)) foo2(S1 a, int b); void bar2(S1 a, int b) { // CHECK-LABEL: define{{.*}} void @_Z4bar22S1i - // CHECK: call x86_fastcallcc void @_Z4foo22S1i(ptr inreg %{{.*}}, i32 inreg % + // CHECK: call x86_fastcallcc void @_Z4foo22S1i(ptr dead_on_return inreg %{{.*}}, i32 inreg % foo2(a, b); } diff --git a/clang/test/CodeGenCXX/homogeneous-aggregates.cpp b/clang/test/CodeGenCXX/homogeneous-aggregates.cpp index 63ffc6b5bfac8..5ebeb8aad4c18 100644 --- a/clang/test/CodeGenCXX/homogeneous-aggregates.cpp +++ b/clang/test/CodeGenCXX/homogeneous-aggregates.cpp @@ -41,8 +41,8 @@ struct D5 : I1, I2, I3 {}; // homogeneous aggregate // PPC: define{{.*}} void @_Z7func_D12D1(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, [3 x i64] %x.coerce) // ARM32: define{{.*}} arm_aapcs_vfpcc void @_Z7func_D12D1(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, [3 x i64] %x.coerce) -// ARM64: define{{.*}} void @_Z7func_D12D1(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, ptr noundef %x) -// X64: define dso_local x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, ptr noundef %x) +// ARM64: define{{.*}} void @_Z7func_D12D1(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, ptr dead_on_return noundef %x) +// X64: define dso_local x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, ptr dead_on_return noundef %x) D1 CC func_D1(D1 x) { return x; } // PPC: define{{.*}} [3 x double] @_Z7func_D22D2([3 x double] %x.coerce) @@ -53,7 +53,7 @@ D2 CC func_D2(D2 x) { return x; } // PPC: define{{.*}} void @_Z7func_D32D3(ptr dead_on_unwind noalias writable sret(%struct.D3) align 8 %agg.result, [4 x i64] %x.coerce) // ARM32: define{{.*}} arm_aapcs_vfpcc void @_Z7func_D32D3(ptr dead_on_unwind noalias writable sret(%struct.D3) align 8 %agg.result, [4 x i64] %x.coerce) -// ARM64: define{{.*}} void @_Z7func_D32D3(ptr dead_on_unwind noalias writable sret(%struct.D3) align 8 %agg.result, ptr noundef %x) +// ARM64: define{{.*}} void @_Z7func_D32D3(ptr dead_on_unwind noalias writable sret(%struct.D3) align 8 %agg.result, ptr dead_on_return noundef %x) D3 CC func_D3(D3 x) { return x; } // PPC: define{{.*}} [4 x double] @_Z7func_D42D4([4 x double] %x.coerce) @@ -201,7 +201,7 @@ struct NonHFA { virtual void f1(); }; double foo(NonHFA v) { return v.x + v.y; } -// WOA64: define dso_local noundef double @"?foo@polymorphic@@YANUNonHFA@1@@Z"(ptr noundef %{{.*}}) +// WOA64: define dso_local noundef double @"?foo@polymorphic@@YANUNonHFA@1@@Z"(ptr dead_on_return noundef %{{.*}}) } 
namespace trivial_copy_assignment { struct HFA { @@ -221,7 +221,7 @@ struct NonHFA { NonHFA &operator=(const NonHFA&); }; double foo(NonHFA v) { return v.x + v.y; } -// WOA64: define dso_local noundef double @"?foo@non_trivial_copy_assignment@@YANUNonHFA@1@@Z"(ptr noundef %{{.*}}) +// WOA64: define dso_local noundef double @"?foo@non_trivial_copy_assignment@@YANUNonHFA@1@@Z"(ptr dead_on_return noundef %{{.*}}) } namespace user_provided_ctor { struct HFA { @@ -251,7 +251,7 @@ struct NonHFA { ~NonHFA(); }; double foo(NonHFA v) { return v.x + v.y; } -// WOA64: define dso_local noundef double @"?foo@non_trivial_dtor@@YANUNonHFA@1@@Z"(ptr noundef %{{.*}}) +// WOA64: define dso_local noundef double @"?foo@non_trivial_dtor@@YANUNonHFA@1@@Z"(ptr dead_on_return noundef %{{.*}}) } namespace non_empty_base { struct non_empty_base { double d; }; @@ -272,7 +272,7 @@ struct NonHFA { empty e; }; double foo(NonHFA v) { return v.x + v.y; } -// WOA64: define dso_local noundef double @"?foo@empty_field@@YANUNonHFA@1@@Z"(ptr noundef %{{.*}}) +// WOA64: define dso_local noundef double @"?foo@empty_field@@YANUNonHFA@1@@Z"(ptr dead_on_return noundef %{{.*}}) } namespace non_empty_field { struct non_empty { double d; }; diff --git a/clang/test/CodeGenCXX/inalloca-lambda.cpp b/clang/test/CodeGenCXX/inalloca-lambda.cpp index dc78aa2773f89..0d527e1a02e5f 100644 --- a/clang/test/CodeGenCXX/inalloca-lambda.cpp +++ b/clang/test/CodeGenCXX/inalloca-lambda.cpp @@ -22,7 +22,7 @@ void test() { // CHECK: %[[V:.*]] = getelementptr inbounds nuw <{ %struct.A }>, ptr %[[ARG]], i32 0, i32 0 // CHECK: %call = call x86_thiscallcc noundef i32 // CHECK-SAME: @"?__impl@@?0??test@@YAXXZ@QBE?A?@@UA@@@Z" -// CHECK-SAME: (ptr noundef %this, ptr noundef %[[V]]) +// CHECK-SAME: (ptr noundef %this, ptr dead_on_return noundef %[[V]]) // CHECK: define internal noundef i32 // CHECK-SAME: @"?__invoke@@?0??test@@YAXXZ@CA?A?@@UA@@@Z" @@ -31,12 +31,12 @@ void test() { // CHECK: %[[VAR:.*]] = getelementptr inbounds nuw <{ %struct.A }>, ptr %[[ARG]], i32 0, i32 0 // CHECK: %call = call x86_thiscallcc noundef i32 // CHECK-SAME: @"?__impl@@?0??test@@YAXXZ@QBE?A?@@UA@@@Z" -// CHECK-SAME: (ptr noundef %unused.capture, ptr noundef %[[VAR]]) +// CHECK-SAME: (ptr noundef %unused.capture, ptr dead_on_return noundef %[[VAR]]) // CHECK: ret i32 %call // CHECK: define internal x86_thiscallcc noundef i32 // CHECK-SAME: @"?__impl@@?0??test@@YAXXZ@QBE?A?@@UA@@@Z" -// CHECK-SAME: (ptr noundef %this, ptr noundef %[[ARG:.*]]) +// CHECK-SAME: (ptr noundef %this, ptr dead_on_return noundef %[[ARG:.*]]) // CHECK: %this.addr = alloca ptr, align 4 // CHECK: store ptr %this, ptr %this.addr, align 4 // CHECK: %this1 = load ptr, ptr %this.addr, align 4 diff --git a/clang/test/CodeGenCXX/inalloca-overaligned.cpp b/clang/test/CodeGenCXX/inalloca-overaligned.cpp index 3751751ec0bca..305b8c5dccd10 100644 --- a/clang/test/CodeGenCXX/inalloca-overaligned.cpp +++ b/clang/test/CodeGenCXX/inalloca-overaligned.cpp @@ -57,7 +57,7 @@ int receive_both(Both o) { } // CHECK-LABEL: define dso_local noundef i32 @"?receive_both@@Y{{.*}}" -// CHECK-SAME: (ptr noundef %o) +// CHECK-SAME: (ptr dead_on_return noundef %o) int pass_both() { gvi32 = receive_both(Both()); @@ -67,7 +67,7 @@ int pass_both() { // CHECK-LABEL: define dso_local noundef i32 @"?pass_both@@Y{{.*}}" // CHECK: [[TMP:%[^ ]*]] = alloca %struct.Both, align 8 // CHECK: call x86_thiscallcc noundef ptr @"??0Both@@QAE@XZ"(ptr {{[^,]*}} [[TMP]]) -// CHECK: call noundef i32 @"?receive_both@@Y{{.*}}"(ptr noundef [[TMP]]) +// CHECK: 
call noundef i32 @"?receive_both@@Y{{.*}}"(ptr dead_on_return noundef [[TMP]]) int receive_inalloca_both(NonTrivial nt, Both o) { return nt.x + o.x + o.y; @@ -101,11 +101,11 @@ struct [[trivial_abi]] alignas(8) MyPtr { int receiveMyPtr(MyPtr o) { return *o.ptr; } // CHECK-LABEL: define dso_local noundef i32 @"?receiveMyPtr@@Y{{.*}}" -// CHECK-SAME: (ptr noundef %o) +// CHECK-SAME: (ptr dead_on_return noundef %o) int passMyPtr() { return receiveMyPtr(MyPtr()); } // CHECK-LABEL: define dso_local noundef i32 @"?passMyPtr@@Y{{.*}}" // CHECK: [[TMP:%[^ ]*]] = alloca %struct.MyPtr, align 8 // CHECK: call x86_thiscallcc noundef ptr @"??0MyPtr@@QAE@XZ"(ptr {{[^,]*}} [[TMP]]) -// CHECK: call noundef i32 @"?receiveMyPtr@@Y{{.*}}"(ptr noundef [[TMP]]) +// CHECK: call noundef i32 @"?receiveMyPtr@@Y{{.*}}"(ptr dead_on_return noundef [[TMP]]) diff --git a/clang/test/CodeGenCXX/inalloca-vector.cpp b/clang/test/CodeGenCXX/inalloca-vector.cpp index d1bacb4f0dc8c..2db4c49df116a 100644 --- a/clang/test/CodeGenCXX/inalloca-vector.cpp +++ b/clang/test/CodeGenCXX/inalloca-vector.cpp @@ -56,7 +56,7 @@ void __fastcall fastcall_receive_vec(__m128 x, __m128 y, __m128 z, __m128 w, int // CHECK-SAME: (<4 x float> inreg noundef %x, // CHECK-SAME: <4 x float> inreg noundef %y, // CHECK-SAME: <4 x float> inreg noundef %z, -// CHECK-SAME: ptr inreg noundef %0, +// CHECK-SAME: ptr dead_on_return inreg noundef %0, // CHECK-SAME: i32 inreg noundef %edx, // CHECK-SAME: ptr inalloca(<{ ptr, %struct.NonTrivial }>) %1) @@ -73,6 +73,6 @@ void __vectorcall vectorcall_receive_vec(double xmm0, double xmm1, double xmm2, // CHECK-SAME: <4 x float> inreg noundef %x, // CHECK-SAME: <4 x float> inreg noundef %y, // CHECK-SAME: <4 x float> inreg noundef %z, -// CHECK-SAME: ptr inreg noundef %0, +// CHECK-SAME: ptr dead_on_return inreg noundef %0, // CHECK-SAME: i32 inreg noundef %edx, // CHECK-SAME: ptr inalloca(<{ ptr, %struct.NonTrivial }>) %1) diff --git a/clang/test/CodeGenCXX/inheriting-constructor.cpp b/clang/test/CodeGenCXX/inheriting-constructor.cpp index 100ca269d7f3c..21751bea055dc 100644 --- a/clang/test/CodeGenCXX/inheriting-constructor.cpp +++ b/clang/test/CodeGenCXX/inheriting-constructor.cpp @@ -166,7 +166,7 @@ namespace inalloca_nonvirt { // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG3]], i32 3) // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG1]], i32 1) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( - // WIN64: call {{.*}} @"??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr %[[ARG1]], i32 2, ptr %[[ARG3]], ptr{{.*}} %[[TMP]]) + // WIN64: call {{.*}} @"??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr dead_on_return %[[ARG1]], i32 2, ptr dead_on_return %[[ARG3]], ptr{{.*}} %[[TMP]]) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call void @"??1Q@@QEAA@XZ"(ptr {{[^,]*}} %[[TMP]]) @@ -202,7 +202,7 @@ namespace inalloca_nonvirt { // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG3]], i32 3) // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG1]], i32 1) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( - // WIN64: call {{.*}} @"??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr %[[ARG1]], i32 2, ptr %[[ARG3]], ptr{{.*}} %[[TMP]]) + // WIN64: call {{.*}} @"??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr dead_on_return %[[ARG1]], i32 2, ptr dead_on_return %[[ARG3]], ptr{{.*}} %[[TMP]]) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call void @"??1Q@@QEAA@XZ"(ptr {{[^,]*}} %[[TMP]]) } @@ -253,7 +253,7 @@ namespace inalloca_virt { // WIN64: call {{.*}} 
@"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG3]], i32 3) // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG1]], i32 1) // WIN64: br i1 - // WIN64: call {{.*}} @"??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr %[[ARG1]], i32 2, ptr %[[ARG3]], ptr{{.*}} %[[TMP]]) + // WIN64: call {{.*}} @"??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr dead_on_return %[[ARG1]], i32 2, ptr dead_on_return %[[ARG3]], ptr{{.*}} %[[TMP]]) // WIN64: br // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( @@ -302,7 +302,7 @@ namespace inalloca_virt { // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG1]], i32 1) // WIN64: br i1 // WIN64: store {{.*}} @"??_8C@inalloca_virt@@7B@" - // WIN64: call {{.*}} @"??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr %[[ARG1]], i32 2, ptr %[[ARG3]], ptr{{.*}} %[[TMP]]) + // WIN64: call {{.*}} @"??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr dead_on_return %[[ARG1]], i32 2, ptr dead_on_return %[[ARG3]], ptr{{.*}} %[[TMP]]) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call void @"??1Q@@QEAA@XZ"(ptr {{[^,]*}} %[[TMP]]) diff --git a/clang/test/CodeGenCXX/member-function-pointer-calls.cpp b/clang/test/CodeGenCXX/member-function-pointer-calls.cpp index ff511c0243801..f06cda8b7684e 100644 --- a/clang/test/CodeGenCXX/member-function-pointer-calls.cpp +++ b/clang/test/CodeGenCXX/member-function-pointer-calls.cpp @@ -16,7 +16,7 @@ int f(A* a, int (A::*fp)()) { // CHECK-NOT: } // CHECK: ret i32 1 // MINGW64-LABEL: define dso_local noundef i32 @_Z2g1v() -// MINGW64: call noundef i32 @_Z1fP1AMS_FivE(ptr noundef %{{.*}}, ptr noundef %{{.*}}) +// MINGW64: call noundef i32 @_Z1fP1AMS_FivE(ptr noundef %{{.*}}, ptr dead_on_return noundef %{{.*}}) int g1() { A a; return f(&a, &A::vf1); @@ -26,7 +26,7 @@ int g1() { // CHECK-NOT: } // CHECK: ret i32 2 // MINGW64-LABEL: define dso_local noundef i32 @_Z2g2v() -// MINGW64: call noundef i32 @_Z1fP1AMS_FivE(ptr noundef %{{.*}}, ptr noundef %{{.*}}) +// MINGW64: call noundef i32 @_Z1fP1AMS_FivE(ptr noundef %{{.*}}, ptr dead_on_return noundef %{{.*}}) int g2() { A a; return f(&a, &A::vf2); diff --git a/clang/test/CodeGenCXX/microsoft-abi-arg-order.cpp b/clang/test/CodeGenCXX/microsoft-abi-arg-order.cpp index b551df747c073..63a4d5525336b 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-arg-order.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-arg-order.cpp @@ -24,7 +24,7 @@ void foo(A a, A b, A c) { // X86: ret void // X64-LABEL: define dso_local void @"?foo@@YAXUA@@00@Z" -// X64: (ptr noundef %[[a:[^,]*]], ptr noundef %[[b:[^,]*]], ptr noundef %[[c:[^)]*]]) +// X64: (ptr dead_on_return noundef %[[a:[^,]*]], ptr dead_on_return noundef %[[b:[^,]*]], ptr dead_on_return noundef %[[c:[^)]*]]) // X64: call void @"??1A@@QEAA@XZ"(ptr {{[^,]*}} %[[a]]) // X64: call void @"??1A@@QEAA@XZ"(ptr {{[^,]*}} %[[b]]) // X64: call void @"??1A@@QEAA@XZ"(ptr {{[^,]*}} %[[c]]) @@ -64,7 +64,7 @@ void call_foo() { // X64: invoke noundef ptr @"??0A@@QEAA@H@Z"(ptr {{[^,]*}} %[[arg2:[^,]*]], i32 noundef 2) // X64: invoke noundef ptr @"??0A@@QEAA@H@Z"(ptr {{[^,]*}} %[[arg1:[^,]*]], i32 noundef 1) // X64: call void @"?foo@@YAXUA@@00@Z" -// X64: (ptr noundef %[[arg1]], ptr noundef %[[arg2]], ptr noundef %[[arg3]]) +// X64: (ptr dead_on_return noundef %[[arg1]], ptr dead_on_return noundef %[[arg2]], ptr dead_on_return noundef %[[arg3]]) // X64: ret void // // lpad2: diff --git a/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp 
b/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp index 5654db3ba8151..813abb03a7810 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp @@ -22,10 +22,10 @@ C::C() {} // force emission // CHECK32-NEXT: ret void // CHECK64-LABEL: define linkonce_odr dso_local void @"?foo@C@byval_thunk@@W7EAAXUAgg@2@@Z" -// CHECK64: (ptr noundef %this, ptr noundef %x) +// CHECK64: (ptr noundef %this, ptr dead_on_return noundef %x) // CHECK64: getelementptr i8, ptr %{{.*}}, i32 -8 // CHECK64: call void @"?foo@C@byval_thunk@@UEAAXUAgg@2@@Z" -// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr noundef %x) +// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr dead_on_return noundef %x) // CHECK64-NOT: call // CHECK64: ret void } @@ -54,10 +54,10 @@ C::C() {} // force emission // CHECK32-NEXT: ret void // CHECK64-LABEL: define linkonce_odr dso_local void @"?foo@C@stdcall_thunk@@W7EAAXUAgg@2@@Z" -// CHECK64: (ptr noundef %this, ptr noundef %x) +// CHECK64: (ptr noundef %this, ptr dead_on_return noundef %x) // CHECK64: getelementptr i8, ptr %{{.*}}, i32 -8 // CHECK64: call void @"?foo@C@stdcall_thunk@@UEAAXUAgg@2@@Z" -// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr noundef %x) +// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr dead_on_return noundef %x) // CHECK64-NOT: call // CHECK64: ret void } @@ -86,10 +86,10 @@ C::C() {} // force emission // CHECK32-NEXT: ret ptr %[[rv]] // CHECK64-LABEL: define linkonce_odr dso_local void @"?foo@C@sret_thunk@@W7EAA?AUAgg@2@U32@@Z" -// CHECK64: (ptr noundef %this, ptr dead_on_unwind noalias writable sret(%"struct.sret_thunk::Agg") align 4 %agg.result, ptr noundef %x) +// CHECK64: (ptr noundef %this, ptr dead_on_unwind noalias writable sret(%"struct.sret_thunk::Agg") align 4 %agg.result, ptr dead_on_return noundef %x) // CHECK64: getelementptr i8, ptr %{{.*}}, i32 -8 // CHECK64: call void @"?foo@C@sret_thunk@@UEAA?AUAgg@2@U32@@Z" -// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr dead_on_unwind writable sret(%"struct.sret_thunk::Agg") align 4 %agg.result, ptr noundef %x) +// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr dead_on_unwind writable sret(%"struct.sret_thunk::Agg") align 4 %agg.result, ptr dead_on_return noundef %x) // CHECK64-NOT: call // CHECK64: ret void } diff --git a/clang/test/CodeGenCXX/microsoft-abi-eh-async.cpp b/clang/test/CodeGenCXX/microsoft-abi-eh-async.cpp new file mode 100644 index 0000000000000..b83173742a2a7 --- /dev/null +++ b/clang/test/CodeGenCXX/microsoft-abi-eh-async.cpp @@ -0,0 +1,209 @@ +// REQUIRES: x86-registered-target + +// RUN: %clang_cl -c --target=x86_64-windows-msvc /EHa -O2 /GS- \ +// RUN: -Xclang=-import-call-optimization \ +// RUN: /clang:-S /clang:-o- -- %s 2>&1 \ +// RUN: | FileCheck %s + +#ifdef __clang__ +#define NO_TAIL __attribute((disable_tail_calls)) +#else +#define NO_TAIL +#endif + +void might_throw(); +void other_func(int x); + +void does_not_throw() noexcept(true); + +extern "C" void __declspec(dllimport) some_dll_import(); + +class HasDtor { + int x; + char foo[40]; + +public: + explicit HasDtor(int x); + ~HasDtor(); +}; + +class BadError { +public: + int errorCode; +}; + +void normal_has_regions() { + // CHECK-LABEL: .def "?normal_has_regions@@YAXXZ" + // CHECK: .seh_endprologue + + // <-- state -1 (none) + { + HasDtor hd{42}; + + // <-- state goes from -1 to 0 + // because state changes, we expect the HasDtor::HasDtor() call to have a NOP + // CHECK: call "??0HasDtor@@QEAA@H@Z" + // CHECK-NEXT: nop + + might_throw(); + // CHECK: call "?might_throw@@YAXXZ" + // CHECK-NEXT: nop + + // <-- state goes from 
0 to -1 because we're about to call HasDtor::~HasDtor() + // CHECK: call "??1HasDtor@@QEAA@XZ" + // <-- state -1 + } + + // <-- state -1 + other_func(10); + // CHECK: call "?other_func@@YAXH@Z" + // CHECK-NEXT: nop + // CHECK: .seh_startepilogue + + // <-- state -1 +} + +// This tests a tail call to a destructor. +void case_dtor_arg_empty_body(HasDtor x) +{ + // CHECK-LABEL: .def "?case_dtor_arg_empty_body@@YAXVHasDtor@@@Z" + // CHECK: jmp "??1HasDtor@@QEAA@XZ" +} + +int case_dtor_arg_empty_with_ret(HasDtor x) +{ + // CHECK-LABEL: .def "?case_dtor_arg_empty_with_ret@@YAHVHasDtor@@@Z" + // CHECK: .seh_endprologue + + // CHECK: call "??1HasDtor@@QEAA@XZ" + // CHECK-NOT: nop + + // The call to HasDtor::~HasDtor() should NOT have a NOP because the + // following "mov eax, 100" instruction is in the same EH state. + + return 100; + + // CHECK: mov eax, 100 + // CHECK: .seh_startepilogue + // CHECK: .seh_endepilogue + // CHECK: .seh_endproc +} + +int case_noexcept_dtor(HasDtor x) noexcept(true) +{ + // CHECK: .def "?case_noexcept_dtor@@YAHVHasDtor@@@Z" + // CHECK: call "??1HasDtor@@QEAA@XZ" + // CHECK-NEXT: mov eax, 100 + // CHECK: .seh_startepilogue + return 100; +} + +void case_except_simple_call() NO_TAIL +{ + does_not_throw(); +} +// CHECK-LABEL: .def "?case_except_simple_call@@YAXXZ" +// CHECK: .seh_endprologue +// CHECK-NEXT: call "?does_not_throw@@YAXXZ" +// CHECK-NEXT: nop +// CHECK-NEXT: .seh_startepilogue +// CHECK: .seh_endproc + +void case_noexcept_simple_call() noexcept(true) NO_TAIL +{ + does_not_throw(); +} +// CHECK-LABEL: .def "?case_noexcept_simple_call@@YAXXZ" +// CHECK: .seh_endprologue +// CHECK-NEXT: call "?does_not_throw@@YAXXZ" +// CHECK-NEXT: nop +// CHECK-NEXT: .seh_startepilogue +// CHECK: .seh_endepilogue +// CHECK-NEXT: ret +// CHECK-NEXT: .seh_endproc + +// This tests that the destructor is called right before SEH_BeginEpilogue, +// but in a function that has a return value. Loading the return value +// counts as a real instruction, so there is no need for a NOP after the +// dtor call. +int case_dtor_arg_calls_no_throw(HasDtor x) +{ + does_not_throw(); // no NOP expected + return 100; +} +// CHECK-LABEL: .def "?case_dtor_arg_calls_no_throw@@YAHVHasDtor@@@Z" +// CHECK: .seh_endprologue +// CHECK: "?does_not_throw@@YAXXZ" +// CHECK-NEXT: nop +// CHECK: "??1HasDtor@@QEAA@XZ" +// CHECK-NEXT: mov eax, 100 +// CHECK: .seh_startepilogue +// CHECK: .seh_endproc + +// Check the behavior of CALLs that are at the end of MBBs. If a CALL is within +// a non-null EH state (state -1) and is at the end of an MBB, then we expect +// to find an EH_LABEL after the CALL. This causes us to insert a NOP, which +// is the desired result. 
+void case_dtor_runs_after_join(int x) { + // CHECK-LABEL: .def "?case_dtor_runs_after_join@@YAXH@Z" + // CHECK: .seh_endprologue + + // <-- EH state -1 + + // ctor call does not need a NOP, because it has real instructions after it + HasDtor hd{42}; + // CHECK: call "??0HasDtor@@QEAA@H@Z" + // CHECK-NEXT: nop + // CHECK: test + + // <-- EH state transition from -1 0 + if (x) { + might_throw(); // <-- NOP expected (at end of BB w/ EH_LABEL) + // CHECK: call "?might_throw@@YAXXZ" + // CHECK-NEXT: nop + } else { + other_func(10); // <-- NOP expected (at end of BB w/ EH_LABEL) + // CHECK: call "?other_func@@YAXH@Z" + // CHECK-NEXT: nop + } + does_not_throw(); + // <-- EH state transition 0 to -1 + // ~HasDtor() runs + + // CHECK: .seh_endproc + + // CHECK: "$ip2state$?case_dtor_runs_after_join@@YAXH@Z": + // CHECK-NEXT: .long [[func_begin:.Lfunc_begin([0-9]+)@IMGREL]] + // CHECK-NEXT: .long -1 + // CHECK-NEXT: .long [[tmp1:.Ltmp([0-9]+)]]@IMGREL + // CHECK-NEXT: .long 0 + // CHECK-NEXT: .long [[tmp2:.Ltmp([0-9]+)]]@IMGREL + // CHECK-NEXT: .long -1 +} + + +// Check the behavior of NOP padding around tail calls. +// We do not expect to insert NOPs around tail calls. +// However, the first call (to other_func()) does get a NOP +// because it comes before .seh_startepilogue. +void case_tail_call_no_eh(bool b) { + // tail call; no NOP padding after JMP + if (b) { + does_not_throw(); + // <-- no NOP here + return; + } + + other_func(20); + // <-- NOP does get inserted here +} +// CHECK-LABEL: .def "?case_tail_call_no_eh@@YAX_N@Z" +// CHECK: test +// CHECK-NEXT: je .LBB +// CHECK: jmp "?does_not_throw@@YAXXZ" +// CHECK-SAME: TAILCALL +// CHECK-NEXT: .LBB +// CHECK-NEXT: mov ecx, 20 +// CHECK-NEXT: jmp "?other_func@@YAXH@Z" +// CHECK-SAME: TAILCALL +// CHECK-NEXT: # -- End function diff --git a/clang/test/CodeGenCXX/microsoft-abi-eh-disabled.cpp b/clang/test/CodeGenCXX/microsoft-abi-eh-disabled.cpp new file mode 100644 index 0000000000000..4c6493f15dee5 --- /dev/null +++ b/clang/test/CodeGenCXX/microsoft-abi-eh-disabled.cpp @@ -0,0 +1,139 @@ +// RUN: %clang_cl -c --target=x86_64-windows-msvc -EHs-c- -O2 -GS- \ +// RUN: -Xclang=-import-call-optimization \ +// RUN: -clang:-S -clang:-o- -- %s 2>&1 \ +// RUN: | FileCheck %s + +#ifdef __clang__ +#define NO_TAIL __attribute((disable_tail_calls)) +#else +#define NO_TAIL +#endif + +void might_throw(); +void other_func(int x); + +void does_not_throw() noexcept(true); + +extern "C" void __declspec(dllimport) some_dll_import(); + +class HasDtor { + int x; + char foo[40]; + +public: + explicit HasDtor(int x); + ~HasDtor(); +}; + +void normal_has_regions() { + { + HasDtor hd{42}; + + // because state changes, we expect the HasDtor::HasDtor() call to have a NOP + might_throw(); + } + + other_func(10); +} +// CHECK-LABEL: .def "?normal_has_regions@@YAXXZ" +// CHECK: .seh_endprologue +// CHECK: call "??0HasDtor@@QEAA@H@Z" +// CHECK-NEXT: call "?might_throw@@YAXXZ" +// CHECK-NEXT: mov +// CHECK: call "??1HasDtor@@QEAA@XZ" +// CHECK-NEXT: mov ecx, 10 +// CHECK-NEXT: call "?other_func@@YAXH@Z" +// CHECK-NEXT: nop +// CHECK-NEXT: .seh_startepilogue +// CHECK-NOT: "$ip2state$?normal_has_regions@@YAXXZ" + +// This tests a tail call to a destructor. 
+void case_dtor_arg_empty_body(HasDtor x) +{ +} +// CHECK-LABEL: .def "?case_dtor_arg_empty_body@@YAXVHasDtor@@@Z" +// CHECK: jmp "??1HasDtor@@QEAA@XZ" + +int case_dtor_arg_empty_with_ret(HasDtor x) +{ + // The call to HasDtor::~HasDtor() should NOT have a NOP because the + // following "mov eax, 100" instruction is in the same EH state. + return 100; +} +// CHECK-LABEL: .def "?case_dtor_arg_empty_with_ret@@YAHVHasDtor@@@Z" +// CHECK: .seh_endprologue +// CHECK: call "??1HasDtor@@QEAA@XZ" +// CHECK-NOT: nop +// CHECK: mov eax, 100 +// CHECK: .seh_startepilogue +// CHECK: .seh_endepilogue +// CHECK: .seh_endproc + +void case_except_simple_call() NO_TAIL +{ + does_not_throw(); +} + +// This tests that the destructor is called right before SEH_BeginEpilogue, +// but in a function that has a return value. +int case_dtor_arg_calls_no_throw(HasDtor x) +{ + does_not_throw(); // no NOP expected + return 100; +} + +// Check the behavior of CALLs that are at the end of MBBs. If a CALL is within +// a non-null EH state (state -1) and is at the end of an MBB, then we expect +// to find an EH_LABEL after the CALL. This causes us to insert a NOP, which +// is the desired result. +void case_dtor_runs_after_join(int x) { + + // ctor call does not need a NOP, because it has real instructions after it + HasDtor hd{42}; + + if (x) { + might_throw(); + } else { + other_func(10); + } + does_not_throw(); + // ~HasDtor() runs +} + +// CHECK-LABEL: .def "?case_dtor_runs_after_join@@YAXH@Z" +// CHECK: .seh_endprologue +// CHECK: call "??0HasDtor@@QEAA@H@Z" +// CHECK-NEXT: test +// CHECK: call "?might_throw@@YAXXZ" +// CHECK-NEXT: jmp +// CHECK: call "?other_func@@YAXH@Z" +// CHECK-NEXT: .LBB +// CHECK: call "?does_not_throw@@YAXXZ" +// CHECK-NEXT: lea +// CHECK-NEXT: call "??1HasDtor@@QEAA@XZ" +// CHECK-NEXT: nop +// CHECK-NEXT: .seh_startepilogue +// CHECK-NOT: "$ip2state$?case_dtor_runs_after_join@@YAXH@Z": + + +// Check the behavior of NOP padding around tail calls. +// We do not expect to insert NOPs around tail calls. +// However, the first call (to other_func()) does get a NOP +// because it comes before .seh_startepilogue. +void case_tail_call_no_eh() { + // ordinary call + other_func(10); + + // tail call; no NOP padding after JMP + does_not_throw(); +} + +// CHECK-LABEL: .def "?case_tail_call_no_eh@@YAXXZ" +// CHECK: .seh_endprologue +// CHECK: call "?other_func@@YAXH@Z" +// CHECK-NEXT: nop +// CHECK-NEXT: .seh_startepilogue +// CHECK: .seh_endepilogue +// CHECK: jmp "?does_not_throw@@YAXXZ" +// CHECK-NOT: nop +// CHECK: .seh_endproc diff --git a/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp b/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp new file mode 100644 index 0000000000000..d6b9b58e55845 --- /dev/null +++ b/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp @@ -0,0 +1,241 @@ +// RUN: %clang_cl -c --target=x86_64-windows-msvc -O2 -EHsc -GS- \ +// RUN: -Xclang=-import-call-optimization \ +// RUN: -clang:-S -clang:-o- -- %s 2>&1 \ +// RUN: | FileCheck %s + +#ifdef __clang__ +#define NO_TAIL __attribute((disable_tail_calls)) +#else +#define NO_TAIL +#endif + +void might_throw(); +void other_func(int x); + +void does_not_throw() noexcept(true); + +extern "C" void __declspec(dllimport) some_dll_import(); + +class HasDtor { + int x; + char foo[40]; + +public: + explicit HasDtor(int x); + ~HasDtor(); +}; + +class BadError { +public: + int errorCode; +}; + +// Verify that when NOP padding for IP2State is active *and* Import Call +// Optimization is active that we see both forms of NOP padding. 
+void case_calls_dll_import() NO_TAIL { + some_dll_import(); +} +// CHECK-LABEL: .def "?case_calls_dll_import@@YAXXZ" +// CHECK: .seh_endprologue +// CHECK: .Limpcall{{[0-9]+}}: +// CHECK-NEXT: rex64 +// CHECK-NEXT: call __imp_some_dll_import +// CHECK-NEXT: nop dword ptr {{\[.*\]}} +// CHECK-NEXT: nop +// CHECK-NEXT: .seh_startepilogue + +void normal_has_regions() { + + // <-- state -1 (none) + { + HasDtor hd{42}; + + // <-- state goes from -1 to 0 + // because state changes, we expect the HasDtor::HasDtor() call to have a NOP + + might_throw(); + + // <-- state goes from 0 to -1 because we're about to call HasDtor::~HasDtor() + // <-- state -1 + } + + // <-- state -1 + other_func(10); + + // <-- state -1 +} +// CHECK-LABEL: .def "?normal_has_regions@@YAXXZ" +// CHECK: .seh_endprologue +// CHECK: call "??0HasDtor@@QEAA@H@Z" +// CHECK-NEXT: nop +// CHECK: call "?might_throw@@YAXXZ" +// CHECK-NEXT: nop +// CHECK: call "??1HasDtor@@QEAA@XZ" +// CHECK: call "?other_func@@YAXH@Z" +// CHECK-NEXT: nop +// CHECK: .seh_startepilogue + +// This tests a tail call to a destructor. +void case_dtor_arg_empty_body(HasDtor x) +{ +} +// CHECK-LABEL: .def "?case_dtor_arg_empty_body@@YAXVHasDtor@@@Z" +// CHECK: jmp "??1HasDtor@@QEAA@XZ" + +int case_dtor_arg_empty_with_ret(HasDtor x) +{ + // CHECK-LABEL: .def "?case_dtor_arg_empty_with_ret@@YAHVHasDtor@@@Z" + // CHECK: .seh_endprologue + + // CHECK: call "??1HasDtor@@QEAA@XZ" + // CHECK-NOT: nop + + // The call to HasDtor::~HasDtor() should NOT have a NOP because the + // following "mov eax, 100" instruction is in the same EH state. + + return 100; + + // CHECK: mov eax, 100 + // CHECK: .seh_startepilogue + // CHECK: .seh_endepilogue + // CHECK: .seh_endproc +} + +int case_noexcept_dtor(HasDtor x) noexcept(true) +{ + // CHECK: .def "?case_noexcept_dtor@@YAHVHasDtor@@@Z" + // CHECK: call "??1HasDtor@@QEAA@XZ" + // CHECK-NEXT: mov eax, 100 + // CHECK-NEXT: .seh_startepilogue + return 100; +} + +// Simple call of a function that can throw +void case_except_simple_call() NO_TAIL +{ + might_throw(); +} +// CHECK-LABEL: .def "?case_except_simple_call@@YAXXZ" +// CHECK: .seh_endprologue +// CHECK-NEXT: call "?might_throw@@YAXXZ" +// CHECK-NEXT: nop +// CHECK-NEXT: .seh_startepilogue + +// Simple call of a function that cannot throw, in a noexcept context. +void case_noexcept_simple_call() noexcept(true) NO_TAIL +{ + does_not_throw(); +} +// CHECK-LABEL: .def "?case_noexcept_simple_call@@YAXXZ" +// CHECK: .seh_endprologue +// CHECK-NEXT: call "?does_not_throw@@YAXXZ" +// CHECK-NEXT: nop +// CHECK-NEXT: .seh_startepilogue + + +// This tests that the destructor is called right before SEH_BeginEpilogue, +// but in a function that has a return value. +int case_dtor_arg_calls_no_throw(HasDtor x) +{ + does_not_throw(); // no NOP expected + return 100; +} + +// Check the behavior of CALLs that are at the end of MBBs. If a CALL is within +// a non-null EH state (state -1) and is at the end of an MBB, then we expect +// to find an EH_LABEL after the CALL. This causes us to insert a NOP, which +// is the desired result. 
+void case_dtor_runs_after_join(int x) { + // CHECK-LABEL: .def "?case_dtor_runs_after_join@@YAXH@Z" + // CHECK: .seh_endprologue + + // <-- EH state -1 + + // ctor call does not need a NOP, because it has real instructions after it + HasDtor hd{42}; + // CHECK: call "??0HasDtor@@QEAA@H@Z" + // CHECK-NEXT: test + + // <-- EH state transition from -1 0 + if (x) { + might_throw(); // <-- NOP expected (at end of BB w/ EH_LABEL) + // CHECK: call "?might_throw@@YAXXZ" + // CHECK-NEXT: nop + } else { + other_func(10); // <-- NOP expected (at end of BB w/ EH_LABEL) + // CHECK: call "?other_func@@YAXH@Z" + // CHECK-NEXT: nop + } + does_not_throw(); + // <-- EH state transition 0 to -1 + // ~HasDtor() runs + + // CHECK: .seh_endproc + + // CHECK: "$ip2state$?case_dtor_runs_after_join@@YAXH@Z": + // CHECK-NEXT: .long [[func_begin:.Lfunc_begin([0-9]+)@IMGREL]] + // CHECK-NEXT: .long -1 + // CHECK-NEXT: .long [[tmp1:.Ltmp([0-9]+)]]@IMGREL + // CHECK-NEXT: .long 0 + // CHECK-NEXT: .long [[tmp2:.Ltmp([0-9]+)]]@IMGREL + // CHECK-NEXT: .long -1 +} + + +// Check the behavior of NOP padding around tail calls. +// We do not expect to insert NOPs around tail calls. +// However, the first call (to other_func()) does get a NOP +// because it comes before .seh_startepilogue. +void case_tail_call_no_eh() { + // CHECK-LABEL: .def "?case_tail_call_no_eh@@YAXXZ" + // CHECK: .seh_endprologue + + // ordinary call + other_func(10); + // CHECK: call "?other_func@@YAXH@Z" + // CHECK-NEXT: nop + + // tail call; no NOP padding after JMP + does_not_throw(); + + // CHECK: .seh_startepilogue + // CHECK: .seh_endepilogue + // CHECK: jmp "?does_not_throw@@YAXXZ" + // CHECK-NOT: nop + // CHECK: .seh_endproc +} + + +// Check the behavior of a try/catch +int case_try_catch() { + // CHECK-LABEL: .def "?case_try_catch@@YAHXZ" + // CHECK: .seh_endprologue + + // Because of the EH_LABELs, the ctor and other_func() get NOPs. + + int result = 0; + try { + // CHECK: call "??0HasDtor@@QEAA@H@Z" + // CHECK-NEXT: nop + HasDtor hd{20}; + + // CHECK: call "?other_func@@YAXH@Z" + // CHECK-NEXT: nop + other_func(10); + + // CHECK: call "??1HasDtor@@QEAA@XZ" + // CHECK: mov + } catch (BadError& e) { + result = 1; + } + return result; + + // CHECK: .seh_endproc + + // CHECK: .def "?dtor$4@?0??case_try_catch@@YAHXZ@4HA" + // CHECK: .seh_endprologue + // CHECK: call "??1HasDtor@@QEAA@XZ" + // CHECK-NEXT: nop + // CHECK: .seh_startepilogue + // CHECK: .seh_endproc +} diff --git a/clang/test/CodeGenCXX/microsoft-abi-member-pointers.cpp b/clang/test/CodeGenCXX/microsoft-abi-member-pointers.cpp index 806bc5b63ef02..8defb68c668b2 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-member-pointers.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-member-pointers.cpp @@ -428,7 +428,7 @@ bool nullTestDataUnspecified(int Unspecified::*mp) { // Pass this large type indirectly. 
// X64-LABEL: define dso_local noundef zeroext i1 @"?nullTestDataUnspecified@@ -// X64: (ptr noundef %0) +// X64: (ptr dead_on_return noundef %0) } bool nullTestFunctionUnspecified(void (Unspecified::*mp)()) { @@ -590,7 +590,7 @@ bool unspecFuncMemptrEq(void (Unspecified::*l)(), void (Unspecified::*r)()) { // CHECK: } // X64-LABEL: define dso_local noundef zeroext i1 @"?unspecFuncMemptrEq@@ -// X64: (ptr noundef %0, ptr noundef %1) +// X64: (ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) } bool unspecFuncMemptrNeq(void (Unspecified::*l)(), void (Unspecified::*r)()) { @@ -635,7 +635,7 @@ bool unspecDataMemptrEq(int Unspecified::*l, int Unspecified::*r) { // CHECK: } // X64-LABEL: define dso_local noundef zeroext i1 @"?unspecDataMemptrEq@@ -// X64: (ptr noundef %0, ptr noundef %1) +// X64: (ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) } void (Multiple::*convertB2FuncToMultiple(void (B2::*mp)()))() { diff --git a/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp b/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp index 6e8ba3953b2cf..767bf168633ae 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp @@ -175,7 +175,7 @@ void multibyte_arg(Multibyte s) {} void packed_arg(Packed s) {} // LINUX-LABEL: define{{.*}} void @_Z10packed_arg6Packed(ptr noundef byval(%struct.Packed) align 4 %s) // WIN32: define dso_local void @"?packed_arg@@YAXUPacked@@@Z"(ptr noundef byval(%struct.Packed) align 4 %s) -// WIN64: define dso_local void @"?packed_arg@@YAXUPacked@@@Z"(ptr noundef %s) +// WIN64: define dso_local void @"?packed_arg@@YAXUPacked@@@Z"(ptr dead_on_return noundef %s) // Test that dtors are invoked in the callee. void small_arg_with_dtor(SmallWithDtor s) {} @@ -190,7 +190,7 @@ void small_arg_with_dtor(SmallWithDtor s) {} // WOA64: } // FIXME: MSVC incompatible! -// WOA: define dso_local arm_aapcs_vfpcc void @"?small_arg_with_dtor@@YAXUSmallWithDtor@@@Z"(ptr noundef %s) {{.*}} { +// WOA: define dso_local arm_aapcs_vfpcc void @"?small_arg_with_dtor@@YAXUSmallWithDtor@@@Z"(ptr dead_on_return noundef %s) {{.*}} { // WOA: call arm_aapcs_vfpcc void @"??1SmallWithDtor@@QAA@XZ"(ptr {{[^,]*}} %s) // WOA: } @@ -220,7 +220,7 @@ void ref_small_arg_with_dtor(const SmallWithDtor &s) { } // WIN64-LABEL: define dso_local void @"?ref_small_arg_with_dtor@@YAXAEBUSmallWithDtor@@@Z"(ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %s) void big_arg_with_dtor(BigWithDtor s) {} -// WIN64-LABEL: define dso_local void @"?big_arg_with_dtor@@YAXUBigWithDtor@@@Z"(ptr noundef %s) +// WIN64-LABEL: define dso_local void @"?big_arg_with_dtor@@YAXUBigWithDtor@@@Z"(ptr dead_on_return noundef %s) // WIN64: call void @"??1BigWithDtor@@QEAA@XZ" // WIN64: } @@ -231,7 +231,7 @@ void call_big_arg_with_dtor() { // larger than 8 bytes and is passed indirectly. 
// WIN64-LABEL: define dso_local void @"?call_big_arg_with_dtor@@YAXXZ"() // WIN64: call noundef ptr @"??0BigWithDtor@@QEAA@XZ" -// WIN64: call void @"?big_arg_with_dtor@@YAXUBigWithDtor@@@Z"(ptr noundef %{{.*}}) +// WIN64: call void @"?big_arg_with_dtor@@YAXUBigWithDtor@@@Z"(ptr dead_on_return noundef %{{.*}}) // WIN64-NOT: call void @"??1BigWithDtor@@QEAA@XZ" // WIN64: ret void @@ -259,22 +259,22 @@ void eh_cleanup_arg_with_dtor() { // WIN32: } void small_arg_with_vftable(SmallWithVftable s) {} -// LINUX-LABEL: define{{.*}} void @_Z22small_arg_with_vftable16SmallWithVftable(ptr noundef %s) +// LINUX-LABEL: define{{.*}} void @_Z22small_arg_with_vftable16SmallWithVftable(ptr dead_on_return noundef %s) // WIN32: define dso_local void @"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr inalloca(<{ %struct.SmallWithVftable }>) %0) -// WIN64: define dso_local void @"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr noundef %s) -// WOA64: define dso_local void @"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr noundef %s) +// WIN64: define dso_local void @"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr dead_on_return noundef %s) +// WOA64: define dso_local void @"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr dead_on_return noundef %s) void medium_arg_with_copy_ctor(MediumWithCopyCtor s) {} -// LINUX-LABEL: define{{.*}} void @_Z25medium_arg_with_copy_ctor18MediumWithCopyCtor(ptr noundef %s) +// LINUX-LABEL: define{{.*}} void @_Z25medium_arg_with_copy_ctor18MediumWithCopyCtor(ptr dead_on_return noundef %s) // WIN32: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr inalloca(<{ %struct.MediumWithCopyCtor }>) %0) -// WIN64: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr noundef %s) -// WOA: define dso_local arm_aapcs_vfpcc void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr noundef %s) -// WOA64: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr noundef %s) +// WIN64: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr dead_on_return noundef %s) +// WOA: define dso_local arm_aapcs_vfpcc void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr dead_on_return noundef %s) +// WOA64: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr dead_on_return noundef %s) void big_arg(Big s) {} // LINUX-LABEL: define{{.*}} void @_Z7big_arg3Big(ptr noundef byval(%struct.Big) align 4 %s) // WIN32: define dso_local void @"?big_arg@@YAXUBig@@@Z"(ptr noundef byval(%struct.Big) align 4 %s) -// WIN64: define dso_local void @"?big_arg@@YAXUBig@@@Z"(ptr noundef %s) +// WIN64: define dso_local void @"?big_arg@@YAXUBig@@@Z"(ptr dead_on_return noundef %s) // PR27607: We would attempt to load i32 value out of the reference instead of // just loading the pointer from the struct during argument expansion. 
@@ -346,7 +346,7 @@ class Class { void thiscall_method_arg(Big s) {} // LINUX: define {{.*}} void @_ZN5Class19thiscall_method_argE3Big(ptr {{[^,]*}} %this, ptr noundef byval(%struct.Big) align 4 %s) // WIN32: define {{.*}} void @"?thiscall_method_arg@Class@@QAEXUBig@@@Z"(ptr {{[^,]*}} %this, ptr noundef byval(%struct.Big) align 4 %s) - // WIN64: define linkonce_odr dso_local void @"?thiscall_method_arg@Class@@QEAAXUBig@@@Z"(ptr {{[^,]*}} %this, ptr noundef %s) + // WIN64: define linkonce_odr dso_local void @"?thiscall_method_arg@Class@@QEAAXUBig@@@Z"(ptr {{[^,]*}} %this, ptr dead_on_return noundef %s) }; void use_class() { diff --git a/clang/test/CodeGenCXX/microsoft-abi-unknown-arch.cpp b/clang/test/CodeGenCXX/microsoft-abi-unknown-arch.cpp index 9e37e71e257fd..b7653632cf882 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-unknown-arch.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-unknown-arch.cpp @@ -18,4 +18,4 @@ A B::foo(A x) { return x; } -// CHECK-LABEL: define{{.*}} void @"?foo@B@@QEAA?AUA@@U2@@Z"(ptr {{[^,]*}} %this, ptr dead_on_unwind noalias writable sret(%struct.A) align 4 %agg.result, ptr noundef %x) +// CHECK-LABEL: define{{.*}} void @"?foo@B@@QEAA?AUA@@U2@@Z"(ptr {{[^,]*}} %this, ptr dead_on_unwind noalias writable sret(%struct.A) align 4 %agg.result, ptr dead_on_return noundef %x) diff --git a/clang/test/CodeGenCXX/ms-property.cpp b/clang/test/CodeGenCXX/ms-property.cpp index 744de224b2f9a..d9fbf46dacb25 100644 --- a/clang/test/CodeGenCXX/ms-property.cpp +++ b/clang/test/CodeGenCXX/ms-property.cpp @@ -105,7 +105,7 @@ int main(int argc, char **argv) { // CHECK: [[ARGC:%.+]] = load i32, ptr % // CHECK: [[P1_X_ARGC_0:%.+]] = call noundef i32 @"?GetX@S@@QEAAHHH@Z"(ptr {{[^,]*}} [[P1]], i32 noundef [[ARGC]], i32 noundef 0) // CHECK: [[CAST:%.+]] = trunc i32 [[P1_X_ARGC_0]] to i8 - // CHECK: [[P2_Y_p1_X_ARGC_0_T:%.+]] = call noundef i8 @"?GetY@?$St@M@@QEAADDVTest1@@@Z"(ptr {{[^,]*}} [[P2_2]], i8 noundef [[CAST]], ptr noundef %{{.+}}) + // CHECK: [[P2_Y_p1_X_ARGC_0_T:%.+]] = call noundef i8 @"?GetY@?$St@M@@QEAADDVTest1@@@Z"(ptr {{[^,]*}} [[P2_2]], i8 noundef [[CAST]], ptr dead_on_return noundef %{{.+}}) // CHECK: [[CAST:%.+]] = sitofp i8 [[P2_Y_p1_X_ARGC_0_T]] to float // CHECK: [[J:%.+]] = load i32, ptr % // CHECK: [[CAST1:%.+]] = sitofp i32 [[J]] to float @@ -124,6 +124,6 @@ int main(int argc, char **argv) { // CHECK: call noundef i32 @"?GetX@?$St@H@@QEAAHHH@Z"(ptr {{[^,]*}} [[BAR]], i32 noundef %{{.+}} i32 noundef %{{.+}}) // CHECK: call void @"?PutY@?$St@H@@QEAAXDHN@Z"(ptr {{[^,]*}} [[BAR]], i8 noundef %{{.+}}, i32 noundef %{{.+}}, double noundef %{{.+}} // CHECK: call noundef i32 @"?GetX@?$St@H@@QEAAHHH@Z"(ptr {{[^,]*}} [[BAR]], i32 noundef %{{.+}} i32 noundef %{{.+}}) -// CHECK: call noundef i8 @"?GetY@?$St@H@@QEAADDVTest1@@@Z"(ptr {{[^,]*}} [[BAR]], i8 noundef %{{.+}}, ptr noundef %{{.+}}) +// CHECK: call noundef i8 @"?GetY@?$St@H@@QEAADDVTest1@@@Z"(ptr {{[^,]*}} [[BAR]], i8 noundef %{{.+}}, ptr dead_on_return noundef %{{.+}}) // CHECK: call noundef i32 @"?PutX@?$St@H@@QEAAHHHH@Z"(ptr {{[^,]*}} [[BAR]], i32 noundef %{{.+}}, i32 noundef %{{.+}}, i32 noundef %{{.+}}) #endif //HEADER diff --git a/clang/test/CodeGenCXX/nrvo.cpp b/clang/test/CodeGenCXX/nrvo.cpp index 1141bc35de582..5b0fc914120e3 100644 --- a/clang/test/CodeGenCXX/nrvo.cpp +++ b/clang/test/CodeGenCXX/nrvo.cpp @@ -2197,7 +2197,7 @@ void test16() { // http://wg21.link/p2025r2#ex-9 // CHECK-EH-11-NEXT: br i1 [[CMP9]], label [[IF_THEN10:%.*]], label [[IF_END11:%.*]] // CHECK-EH-11: if.then10: // CHECK-EH-11-NEXT: 
store i32 3, ptr [[CLEANUP_DEST_SLOT]], align 4 -// CHECK-EH-11-NEXT: br label [[CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK-EH-11-NEXT: br label [[CLEANUP]], !llvm.loop [[LOOP4:![0-9]+]] // CHECK-EH-11: if.end11: // CHECK-EH-11-NEXT: [[TMP4:%.*]] = load i32, ptr [[I_ADDR]], align 4 // CHECK-EH-11-NEXT: [[CMP12:%.*]] = icmp eq i32 [[TMP4]], 3 @@ -2239,7 +2239,7 @@ void test16() { // http://wg21.link/p2025r2#ex-9 // CHECK-EH-11-NEXT: i32 2, label [[IMPOSSIBLE]] // CHECK-EH-11-NEXT: ] // CHECK-EH-11: cleanup.cont: -// CHECK-EH-11-NEXT: br label [[WHILE_BODY]], !llvm.loop [[LOOP3]] +// CHECK-EH-11-NEXT: br label [[WHILE_BODY]], !llvm.loop [[LOOP4]] // CHECK-EH-11: while.end: // CHECK-EH-11-NEXT: call void @_ZN1XC1Ev(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_RESULT]]) // CHECK-EH-11-NEXT: br label [[RETURN]] diff --git a/clang/test/CodeGenCXX/pass-by-value-noalias.cpp b/clang/test/CodeGenCXX/pass-by-value-noalias.cpp index 773cf6b81c3b2..947379d9b8b92 100644 --- a/clang/test/CodeGenCXX/pass-by-value-noalias.cpp +++ b/clang/test/CodeGenCXX/pass-by-value-noalias.cpp @@ -14,8 +14,8 @@ struct Foo { // Make sure noalias is added to indirect arguments with trivially copyable types // if -fpass-by-value-is-noalias is provided. -// WITH_NOALIAS: define{{.*}} void @_Z4take3Foo(ptr noalias noundef %arg) -// NO_NOALIAS: define{{.*}} void @_Z4take3Foo(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @_Z4take3Foo(ptr dead_on_return noalias noundef %arg) +// NO_NOALIAS: define{{.*}} void @_Z4take3Foo(ptr dead_on_return noundef %arg) void take(Foo arg) {} int G; @@ -38,8 +38,8 @@ struct NonTrivial { // Make sure noalias is not added to indirect arguments that are not trivially // copyable even if -fpass-by-value-is-noalias is provided. -// WITH_NOALIAS: define{{.*}} void @_Z4take10NonTrivial(ptr noundef %arg) -// NO_NOALIAS: define{{.*}} void @_Z4take10NonTrivial(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @_Z4take10NonTrivial(ptr dead_on_return noundef %arg) +// NO_NOALIAS: define{{.*}} void @_Z4take10NonTrivial(ptr dead_on_return noundef %arg) void take(NonTrivial arg) {} // Escape examples. Pointers to the objects passed to take() may escape, depending on whether a temporary copy is created or not (e.g. due to NRVO). 
@@ -54,8 +54,8 @@ struct A { }; A *p; -// WITH_NOALIAS: define{{.*}} void @_Z4take1A(ptr noalias noundef %arg) -// NO_NOALIAS: define{{.*}} void @_Z4take1A(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @_Z4take1A(ptr dead_on_return noalias noundef %arg) +// NO_NOALIAS: define{{.*}} void @_Z4take1A(ptr dead_on_return noundef %arg) void take(A arg) {} // WITH_NOALIAS: define{{.*}} void @_Z7CreateAPP1A(ptr dead_on_unwind noalias writable sret(%struct.A) align 1 %agg.result, ptr noundef %where) diff --git a/clang/test/CodeGenCXX/pragma-loop.cpp b/clang/test/CodeGenCXX/pragma-loop.cpp index 4857299f1c037..8cb3346247daf 100644 --- a/clang/test/CodeGenCXX/pragma-loop.cpp +++ b/clang/test/CodeGenCXX/pragma-loop.cpp @@ -203,6 +203,43 @@ void for_test_scalable_1(int *List, int Length) { } } +// Verify for loop is not performing vectorization +void for_test_width_1(int *List, int Length) { +#pragma clang loop vectorize_width(1) interleave_count(4) unroll(disable) distribute(disable) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_20:.*]] + List[i] = i * 2; + } +} + +// Verify for loop is not performing vectorization +void for_test_fixed_1(int *List, int Length) { +#pragma clang loop vectorize_width(1, fixed) interleave_count(4) unroll(disable) distribute(disable) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_21:.*]] + List[i] = i * 2; + } +} + + +// Verify unroll attributes are directly attached to the loop metadata +void for_test_vectorize_disable_unroll(int *List, int Length) { +#pragma clang loop vectorize(disable) unroll_count(8) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_22:.*]] + List[i] = i * 2; + } +} + +// Verify unroll attributes are directly attached to the loop metadata +void for_test_interleave_vectorize_disable_unroll(int *List, int Length) { +#pragma clang loop vectorize(disable) interleave_count(4) unroll_count(8) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_23:.*]] + List[i] = i * 2; + } +} + // CHECK-DAG: ![[MP:[0-9]+]] = !{!"llvm.loop.mustprogress"} // CHECK-DAG: ![[UNROLL_DISABLE:[0-9]+]] = !{!"llvm.loop.unroll.disable"} @@ -270,3 +307,7 @@ void for_test_scalable_1(int *List, int Length) { // CHECK-DAG: ![[LOOP_17]] = distinct !{![[LOOP_17]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[FIXED_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} // CHECK-DAG: ![[LOOP_18]] = distinct !{![[LOOP_18]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} // CHECK-DAG: ![[LOOP_19]] = distinct !{![[LOOP_19]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} +// CHECK-DAG: ![[LOOP_20]] = distinct !{![[LOOP_20]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[FIXED_VEC]], ![[INTERLEAVE_4]]} +// CHECK-DAG: ![[LOOP_21]] = distinct !{![[LOOP_21]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[FIXED_VEC]], ![[INTERLEAVE_4]]} +// CHECK-DAG: ![[LOOP_22]] = distinct !{![[LOOP_22]], ![[MP]], ![[WIDTH_1]], ![[ISVECTORIZED]], ![[UNROLL_8]]} +// CHECK-DAG: ![[LOOP_23]] = distinct !{![[LOOP_23]], ![[MP]], ![[WIDTH_1]], ![[INTERLEAVE_4]], ![[ISVECTORIZED]], ![[UNROLL_8]]} diff --git a/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp b/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp index daeea77774ec8..0310535362e3d 100644 --- 
a/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp +++ b/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp @@ -24,7 +24,7 @@ struct __attribute__((trivial_abi)) TrivialSA { // Check that TrivialSA is passed indirectly despite being annotated with // 'trivial_abi'. -// CHECK: define {{.*}}void @_Z18testParamTrivialSA9TrivialSA(ptr noundef %{{.*}}) +// CHECK: define {{.*}}void @_Z18testParamTrivialSA9TrivialSA(ptr dead_on_return noundef %{{.*}}) void testParamTrivialSA(TrivialSA a) { } diff --git a/clang/test/CodeGenCXX/regparm.cpp b/clang/test/CodeGenCXX/regparm.cpp index b9735485db8de..a31394016fb55 100644 --- a/clang/test/CodeGenCXX/regparm.cpp +++ b/clang/test/CodeGenCXX/regparm.cpp @@ -11,7 +11,7 @@ struct S1 { }; void __attribute__((regparm(3))) foo2(S1 a, int b); -// CHECK: declare void @_Z4foo22S1i(ptr inreg noundef, i32 inreg noundef) +// CHECK: declare void @_Z4foo22S1i(ptr dead_on_return inreg noundef, i32 inreg noundef) void bar2(S1 a, int b) { foo2(a, b); } diff --git a/clang/test/CodeGenCXX/trivial_abi.cpp b/clang/test/CodeGenCXX/trivial_abi.cpp index b8cc0d1cc6528..eacbde594e517 100644 --- a/clang/test/CodeGenCXX/trivial_abi.cpp +++ b/clang/test/CodeGenCXX/trivial_abi.cpp @@ -140,7 +140,7 @@ void testIgnoredSmall() { testReturnSmall(); } -// CHECK: define{{.*}} void @_Z14testParamLarge5Large(ptr noundef %[[A:.*]]) +// CHECK: define{{.*}} void @_Z14testParamLarge5Large(ptr dead_on_return noundef %[[A:.*]]) // CHECK: %[[CALL:.*]] = call noundef ptr @_ZN5LargeD1Ev(ptr {{[^,]*}} %[[A]]) // CHECK: ret void // CHECK: } @@ -163,7 +163,7 @@ Large testReturnLarge() { // CHECK: %[[AGG_TMP:.*]] = alloca %[[STRUCT_LARGE]], align 8 // CHECK: %[[CALL:.*]] = call noundef ptr @_ZN5LargeC1Ev(ptr {{[^,]*}} %[[T]]) // CHECK: %[[CALL1:.*]] = call noundef ptr @_ZN5LargeC1ERKS_(ptr {{[^,]*}} %[[AGG_TMP]], ptr noundef nonnull align 8 dereferenceable(520) %[[T]]) -// CHECK: call void @_Z14testParamLarge5Large(ptr noundef %[[AGG_TMP]]) +// CHECK: call void @_Z14testParamLarge5Large(ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK: %[[CALL2:.*]] = call noundef ptr @_ZN5LargeD1Ev(ptr {{[^,]*}} %[[T]]) // CHECK: ret void // CHECK: } @@ -176,7 +176,7 @@ void testCallLarge0() { // CHECK: define{{.*}} void @_Z14testCallLarge1v() // CHECK: %[[AGG_TMP:.*]] = alloca %[[STRUCT_LARGE:.*]], align 8 // CHECK: call void @_Z15testReturnLargev(ptr dead_on_unwind writable sret(%[[STRUCT_LARGE]]) align 8 %[[AGG_TMP]]) -// CHECK: call void @_Z14testParamLarge5Large(ptr noundef %[[AGG_TMP]]) +// CHECK: call void @_Z14testParamLarge5Large(ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK: ret void // CHECK: } @@ -244,7 +244,7 @@ void testExceptionSmall() { // CHECK: call noundef ptr @_ZN5LargeC1Ev(ptr {{[^,]*}} %[[AGG_TMP]]) // CHECK: invoke noundef ptr @_ZN5LargeC1Ev(ptr {{[^,]*}} %[[AGG_TMP1]]) -// CHECK: call void @_Z20calleeExceptionLarge5LargeS_(ptr noundef %[[AGG_TMP]], ptr noundef %[[AGG_TMP1]]) +// CHECK: call void @_Z20calleeExceptionLarge5LargeS_(ptr dead_on_return noundef %[[AGG_TMP]], ptr dead_on_return noundef %[[AGG_TMP1]]) // CHECK-NEXT: ret void // CHECK: landingpad { ptr, i32 } diff --git a/clang/test/CodeGenCXX/uncopyable-args.cpp b/clang/test/CodeGenCXX/uncopyable-args.cpp index 31192b65cc362..2d09732f9d3c2 100644 --- a/clang/test/CodeGenCXX/uncopyable-args.cpp +++ b/clang/test/CodeGenCXX/uncopyable-args.cpp @@ -59,12 +59,12 @@ void bar() { // CHECK-LABEL: define{{.*}} void @_ZN9move_ctor3barEv() // CHECK: call void @_Z{{.*}}C1Ev( // CHECK-NOT: call -// NEWABI: call void 
@_ZN9move_ctor3fooENS_1AE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN9move_ctor3fooENS_1AE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void @_ZN9move_ctor3fooENS_1AE(ptr %{{.*}}) -// NEWABI-LABEL: declare void @_ZN9move_ctor3fooENS_1AE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN9move_ctor3fooENS_1AE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN9move_ctor3fooENS_1AE(ptr) -// WIN64-LABEL: declare dso_local void @"?foo@move_ctor@@YAXUA@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@move_ctor@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace all_deleted { @@ -81,12 +81,12 @@ void bar() { // CHECK-LABEL: define{{.*}} void @_ZN11all_deleted3barEv() // CHECK: call void @_Z{{.*}}C1Ev( // CHECK-NOT: call -// NEWABI: call void @_ZN11all_deleted3fooENS_1AE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN11all_deleted3fooENS_1AE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void @_ZN11all_deleted3fooENS_1AE(ptr %{{.*}}) -// NEWABI-LABEL: declare void @_ZN11all_deleted3fooENS_1AE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN11all_deleted3fooENS_1AE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN11all_deleted3fooENS_1AE(ptr) -// WIN64-LABEL: declare dso_local void @"?foo@all_deleted@@YAXUA@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@all_deleted@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace implicitly_deleted { @@ -102,14 +102,14 @@ void bar() { // CHECK-LABEL: define{{.*}} void @_ZN18implicitly_deleted3barEv() // CHECK: call void @_Z{{.*}}C1Ev( // CHECK-NOT: call -// NEWABI: call void @_ZN18implicitly_deleted3fooENS_1AE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN18implicitly_deleted3fooENS_1AE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void @_ZN18implicitly_deleted3fooENS_1AE(ptr %{{.*}}) -// NEWABI-LABEL: declare void @_ZN18implicitly_deleted3fooENS_1AE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN18implicitly_deleted3fooENS_1AE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN18implicitly_deleted3fooENS_1AE(ptr) // In MSVC 2013, the copy ctor is not deleted by a move assignment. In MSVC 2015, it is. 
// WIN64-18-LABEL: declare dso_local void @"?foo@implicitly_deleted@@YAXUA@1@@Z"(i64 -// WIN64-19-LABEL: declare dso_local void @"?foo@implicitly_deleted@@YAXUA@1@@Z"(ptr noundef) +// WIN64-19-LABEL: declare dso_local void @"?foo@implicitly_deleted@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace one_deleted { @@ -125,12 +125,12 @@ void bar() { // CHECK-LABEL: define{{.*}} void @_ZN11one_deleted3barEv() // CHECK: call void @_Z{{.*}}C1Ev( // CHECK-NOT: call -// NEWABI: call void @_ZN11one_deleted3fooENS_1AE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN11one_deleted3fooENS_1AE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void @_ZN11one_deleted3fooENS_1AE(ptr %{{.*}}) -// NEWABI-LABEL: declare void @_ZN11one_deleted3fooENS_1AE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN11one_deleted3fooENS_1AE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN11one_deleted3fooENS_1AE(ptr) -// WIN64-LABEL: declare dso_local void @"?foo@one_deleted@@YAXUA@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@one_deleted@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace copy_defaulted { @@ -170,7 +170,7 @@ void bar() { // CHECK: call void @_ZN14move_defaulted3fooENS_1AE(ptr %{{.*}}) // CHECK-LABEL: declare void @_ZN14move_defaulted3fooENS_1AE(ptr) -// WIN64-LABEL: declare dso_local void @"?foo@move_defaulted@@YAXUA@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@move_defaulted@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace trivial_defaulted { @@ -207,12 +207,12 @@ void bar() { } // CHECK-LABEL: define{{.*}} void @_ZN14two_copy_ctors3barEv() // CHECK: call void @_Z{{.*}}C1Ev( -// NEWABI: call void @_ZN14two_copy_ctors3fooENS_1BE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN14two_copy_ctors3fooENS_1BE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void @_ZN14two_copy_ctors3fooENS_1BE(ptr noundef byval -// NEWABI-LABEL: declare void @_ZN14two_copy_ctors3fooENS_1BE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN14two_copy_ctors3fooENS_1BE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN14two_copy_ctors3fooENS_1BE(ptr noundef byval -// WIN64-LABEL: declare dso_local void @"?foo@two_copy_ctors@@YAXUB@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@two_copy_ctors@@YAXUB@1@@Z"(ptr dead_on_return noundef) } namespace definition_only { diff --git a/clang/test/CodeGenCXX/wasm-args-returns.cpp b/clang/test/CodeGenCXX/wasm-args-returns.cpp index fbb152ac1bb3d..7b1c27bf0808d 100644 --- a/clang/test/CodeGenCXX/wasm-args-returns.cpp +++ b/clang/test/CodeGenCXX/wasm-args-returns.cpp @@ -46,17 +46,17 @@ struct copy_ctor { copy_ctor(copy_ctor const &); }; test(copy_ctor); -// CHECK: define void @_Z7forward9copy_ctor(ptr dead_on_unwind noalias {{[^,]*}} sret(%struct.copy_ctor) align 8 %{{.*}}, ptr nonnull %{{.*}}) +// CHECK: define void @_Z7forward9copy_ctor(ptr dead_on_unwind noalias {{[^,]*}} sret(%struct.copy_ctor) align 8 %{{.*}}, ptr dead_on_return nonnull %{{.*}}) // // CHECK: declare ptr @_ZN9copy_ctorC1ERKS_(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align 8 dereferenceable(8)) // // CHECK: define void @_Z14test_copy_ctorv() // CHECK: %[[tmp:.*]] = alloca %struct.copy_ctor, align 8 // CHECK: call void @_Z13def_copy_ctorv(ptr dead_on_unwind nonnull writable sret(%struct.copy_ctor) align 8 %[[tmp]]) -// CHECK: call void @_Z3use9copy_ctor(ptr nonnull %[[tmp]]) +// CHECK: call void @_Z3use9copy_ctor(ptr dead_on_return nonnull %[[tmp]]) // CHECK: ret void // -// CHECK: declare void @_Z3use9copy_ctor(ptr) +// CHECK: 
declare void @_Z3use9copy_ctor(ptr dead_on_return) // CHECK: declare void @_Z13def_copy_ctorv(ptr dead_on_unwind writable sret(%struct.copy_ctor) align 8) struct __attribute__((aligned(16))) aligned_copy_ctor { @@ -64,17 +64,17 @@ struct __attribute__((aligned(16))) aligned_copy_ctor { aligned_copy_ctor(aligned_copy_ctor const &); }; test(aligned_copy_ctor); -// CHECK: define void @_Z7forward17aligned_copy_ctor(ptr dead_on_unwind noalias {{[^,]*}} sret(%struct.aligned_copy_ctor) align 16 %{{.*}}, ptr nonnull %{{.*}}) +// CHECK: define void @_Z7forward17aligned_copy_ctor(ptr dead_on_unwind noalias {{[^,]*}} sret(%struct.aligned_copy_ctor) align 16 %{{.*}}, ptr dead_on_return nonnull %{{.*}}) // // CHECK: declare ptr @_ZN17aligned_copy_ctorC1ERKS_(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align 16 dereferenceable(16)) // // CHECK: define void @_Z22test_aligned_copy_ctorv() // CHECK: %[[tmp:.*]] = alloca %struct.aligned_copy_ctor, align 16 // CHECK: call void @_Z21def_aligned_copy_ctorv(ptr dead_on_unwind nonnull writable sret(%struct.aligned_copy_ctor) align 16 %[[tmp]]) -// CHECK: call void @_Z3use17aligned_copy_ctor(ptr nonnull %[[tmp]]) +// CHECK: call void @_Z3use17aligned_copy_ctor(ptr dead_on_return nonnull %[[tmp]]) // CHECK: ret void // -// CHECK: declare void @_Z3use17aligned_copy_ctor(ptr) +// CHECK: declare void @_Z3use17aligned_copy_ctor(ptr dead_on_return) // CHECK: declare void @_Z21def_aligned_copy_ctorv(ptr dead_on_unwind writable sret(%struct.aligned_copy_ctor) align 16) struct empty {}; diff --git a/clang/test/CodeGenCXX/windows-x86-swiftcall.cpp b/clang/test/CodeGenCXX/windows-x86-swiftcall.cpp index 9927d0b24799e..78c45753b5bbd 100644 --- a/clang/test/CodeGenCXX/windows-x86-swiftcall.cpp +++ b/clang/test/CodeGenCXX/windows-x86-swiftcall.cpp @@ -18,7 +18,7 @@ struct NonTrivial { SWIFTCALL int receiveNonTrivial(NonTrivial o) { return o.o; } -// CHECK-LABEL: define dso_local swiftcc noundef i32 @"?receiveNonTrivial@@YSHUNonTrivial@@@Z"(ptr noundef %o) +// CHECK-LABEL: define dso_local swiftcc noundef i32 @"?receiveNonTrivial@@YSHUNonTrivial@@@Z"(ptr dead_on_return noundef %o) int passNonTrivial() { return receiveNonTrivial({}); @@ -26,4 +26,4 @@ int passNonTrivial() { // CHECK-LABEL: define dso_local noundef i32 @"?passNonTrivial@@YAHXZ"() // CHECK-NOT: stacksave -// CHECK: call swiftcc noundef i32 @"?receiveNonTrivial@@YSHUNonTrivial@@@Z"(ptr noundef %{{.*}}) +// CHECK: call swiftcc noundef i32 @"?receiveNonTrivial@@YSHUNonTrivial@@@Z"(ptr dead_on_return noundef %{{.*}}) diff --git a/clang/test/CodeGenHLSL/builtins/refract.hlsl b/clang/test/CodeGenHLSL/builtins/refract.hlsl new file mode 100644 index 0000000000000..eda256451ee2b --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/refract.hlsl @@ -0,0 +1,244 @@ +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh( +// CHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp 
afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half 0xH3C00, [[MUL2_I]] +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[MUL1_I]], [[SUB_I]] +// CHECK: [[SUB4_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half 0xH3C00, [[MUL3_I]] +// CHECK: [[MUL5_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL6_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[TMP0:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.sqrt.f16(half %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn half [[MUL6_I]], %{{.*}} +// CHECK: [[MUL7_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[ADD_I]], %{{.*}} +// CHECK: [[SUB8_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half %{{.*}}, [[MUL7_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt half %{{.*}}, 0xH0000 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CMP_I]], half 0xH0000, half %{{.*}} +// CHECK: ret half [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh( +// SPVCHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.refract.f16.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}) +// SPVCHECK: ret half [[SPV_REFRACT_I]] +// +half test_refract_half(half I, half N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z18test_refract_half2Dv2_DhS_Dh( +// CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> splat (half 0xH3C00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> splat (half 0xH3C00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.sqrt.v2f16(<2 x half> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <2 x half> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <2 x half> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <2 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <2 x half> zeroinitializer, <2 x 
half> %{{.*}} +// CHECK: ret <2 x half> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_refract_half2Dv2_DhS_Dh( +// SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.refract.v2f16.f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, half %{{.*}}) +// SPVCHECK: ret <2 x half> [[SPV_REFRACT_I]] +// +half2 test_refract_half2(half2 I, half2 N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z18test_refract_half3Dv3_DhS_Dh( +// CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> splat (half 0xH3C00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> splat (half 0xH3C00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.sqrt.v3f16(<3 x half> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <3 x half> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <3 x half> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <3 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <3 x half> zeroinitializer, <3 x half> %{{.*}} +// CHECK: ret <3 x half> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_refract_half3Dv3_DhS_Dh( +// SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.refract.v3f16.f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}, half %{{.*}}) +// SPVCHECK: ret <3 x half> [[SPV_REFRACT_I]] +// +half3 test_refract_half3(half3 I, half3 N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z18test_refract_half4Dv4_DhS_Dh( +// CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = 
call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> splat (half 0xH3C00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> splat (half 0xH3C00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.sqrt.v4f16(<4 x half> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <4 x half> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x half> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <4 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <4 x half> zeroinitializer, <4 x half> %{{.*}} +// CHECK: ret <4 x half> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_refract_half4Dv4_DhS_Dh( +// SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, half %{{.*}}) +// SPVCHECK: ret <4 x half> [[SPV_REFRACT_I]] +// +half4 test_refract_half4(half4 I, half4 N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_refract_floatfff( +// CHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float 1.000000e+00, [[MUL2_I]] +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[MUL1_I]], [[SUB_I]] +// CHECK: [[SUB4_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float 1.000000e+00, [[MUL3_I]] +// CHECK: [[MUL5_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL6_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(float %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn float [[MUL6_I]], %{{.*}} +// CHECK: [[MUL7_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[ADD_I]], %{{.*}} +// CHECK: [[SUB8_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float %{{.*}}, [[MUL7_I]] +// CHECK: 
[[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float %{{.*}}, 0.000000e+00 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CMP_I]], float 0.000000e+00, float %{{.*}} +// CHECK: ret float [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_refract_floatfff( +// SPVCHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.refract.f32.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) +// SPVCHECK: ret float [[SPV_REFRACT_I]] +// +float test_refract_float(float I, float N, float ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z19test_refract_float2Dv2_fS_f( +// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <2 x float> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <2 x float> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <2 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <2 x float> zeroinitializer, <2 x float> %{{.*}} +// CHECK: ret <2 x float> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_refract_float2Dv2_fS_f( +// SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.refract.v2f32.f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}, float %{{.*}}) +// SPVCHECK: ret <2 x float> [[SPV_REFRACT_I]] +// +float2 test_refract_float2(float2 I, float2 N, float ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z19test_refract_float3Dv3_fS_f( +// CHECK-SAME: <3 x float> noundef 
nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <3 x float> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <3 x float> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <3 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <3 x float> zeroinitializer, <3 x float> %{{.*}} +// CHECK: ret <3 x float> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_refract_float3Dv3_fS_f( +// SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.refract.v3f32.f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}, float %{{.*}}) +// SPVCHECK: ret <3 x float> [[SPV_REFRACT_I]] +// +float3 test_refract_float3(float3 I, float3 N, float ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z19test_refract_float4Dv4_fS_f +// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, 
%{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <4 x float> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x float> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <4 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <4 x float> zeroinitializer, <4 x float> %{{.*}} +// CHECK: ret <4 x float> [[HLSL_SELECT_I]] + +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_refract_float4Dv4_fS_f( +// SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) %{{.*}}, <4 x float> noundef nofpclass(nan inf) %{{.*}}, float noundef nofpclass(nan inf) %{{.*}}) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.refract.v4f32.f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, float %{{.*}}) +// SPVCHECK: ret <4 x float> [[SPV_REFRACT_I]] +// +float4 test_refract_float4(float4 I, float4 N, float ETA) { + return refract(I, N, ETA); +} diff --git a/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m b/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m index d2a954ae26a04..ba8a04b52716e 100644 --- a/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m +++ b/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m @@ -42,7 +42,7 @@ void testStrongException(void) { // CHECK: call void @genWeak(ptr dead_on_unwind writable sret(%[[STRUCT_WEAK]]) align 8 %[[AGG_TMP]]) // CHECK: invoke void @genWeak(ptr dead_on_unwind writable sret(%[[STRUCT_WEAK]]) align 8 %[[AGG_TMP1]]) -// CHECK: call void @calleeWeak(ptr noundef %[[AGG_TMP]], ptr noundef %[[AGG_TMP1]]) +// CHECK: call void @calleeWeak(ptr dead_on_return noundef %[[AGG_TMP]], ptr dead_on_return noundef %[[AGG_TMP1]]) // CHECK: ret void // CHECK: landingpad { ptr, i32 } diff --git a/clang/test/CodeGenObjC/pass-by-value-noalias.m b/clang/test/CodeGenObjC/pass-by-value-noalias.m index ed94d4c80b525..86a4ba36a1b8f 100644 --- a/clang/test/CodeGenObjC/pass-by-value-noalias.m +++ b/clang/test/CodeGenObjC/pass-by-value-noalias.m @@ -17,6 +17,6 @@ @interface Bar Bar *__weak f; }; -// WITH_NOALIAS: define{{.*}} void @take(ptr noundef %arg) -// NO_NOALIAS: define{{.*}} void @take(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @take(ptr dead_on_return noundef %arg) +// NO_NOALIAS: define{{.*}} void @take(ptr dead_on_return noundef %arg) void take(struct Foo arg) {} diff --git a/clang/test/CodeGenObjC/weak-in-c-struct.m b/clang/test/CodeGenObjC/weak-in-c-struct.m index be80edd1ff11d..6809360d03da1 100644 --- a/clang/test/CodeGenObjC/weak-in-c-struct.m +++ b/clang/test/CodeGenObjC/weak-in-c-struct.m @@ -130,7 +130,7 @@ void test_move_assignment_Weak(Weak *p) { *p = getWeak(); } -// COMMON: define{{.*}} void @test_parameter_Weak(ptr noundef %[[A:.*]]) +// COMMON: define{{.*}} void @test_parameter_Weak(ptr dead_on_return noundef %[[A:.*]]) // COMMON: call void @__destructor_{{.*}}(ptr %[[A]]) void test_parameter_Weak(Weak a) { @@ -142,7 +142,7 @@ void test_parameter_Weak(Weak a) { // COMMON: store ptr %[[A]], ptr %[[A_ADDR]] // COMMON: %[[V0:.*]] = load ptr, ptr %[[A_ADDR]] // COMMON: call 
void @__copy_constructor_{{.*}}(ptr %[[AGG_TMP]], ptr %[[V0]]) -// COMMON: call void @calleeWeak(ptr noundef %[[AGG_TMP]]) +// COMMON: call void @calleeWeak(ptr dead_on_return noundef %[[AGG_TMP]]) // COMMON-NEXT: ret void test_argument_Weak(Weak *a) { @@ -164,7 +164,7 @@ Weak test_return_Weak(Weak *a) { // COMMON: %[[AGG_TMP:.*]] = alloca %[[STRUCT_WEAK]] // COMMON: br i1 -// COMMON: call void @objc_msgSend({{.*}}, ptr noundef %[[AGG_TMP]]) +// COMMON: call void @objc_msgSend({{.*}}, ptr dead_on_return noundef %[[AGG_TMP]]) // COMMON: br // COMMON: call void @__destructor_{{.*}}(ptr %[[AGG_TMP]]) diff --git a/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm b/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm index 4d76796d86d1f..3a043c4892981 100644 --- a/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm +++ b/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm @@ -75,7 +75,7 @@ - (void)passStrongWeak:(StrongWeak)a; - (void)passNonTrivial:(NonTrivial)a; @end -// CHECK: define{{.*}} void @_Z19testParamStrongWeak10StrongWeak(ptr noundef %{{.*}}) +// CHECK: define{{.*}} void @_Z19testParamStrongWeak10StrongWeak(ptr dead_on_return noundef %{{.*}}) // CHECK: call noundef ptr @_ZN10StrongWeakD1Ev( // CHECK-NEXT: ret void @@ -88,7 +88,7 @@ void testParamStrongWeak(StrongWeak a) { // CHECK: store ptr %[[A]], ptr %[[A_ADDR]], align 8 // CHECK: %[[V0:.*]] = load ptr, ptr %[[A_ADDR]], align 8 // CHECK: %[[CALL:.*]] = call noundef ptr @_ZN10StrongWeakC1ERKS_(ptr {{[^,]*}} %[[AGG_TMP]], ptr noundef nonnull align 8 dereferenceable(16) %[[V0]]) -// CHECK: call void @_Z19testParamStrongWeak10StrongWeak(ptr noundef %[[AGG_TMP]]) +// CHECK: call void @_Z19testParamStrongWeak10StrongWeak(ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK-NOT: call // CHECK: ret void @@ -107,13 +107,13 @@ StrongWeak testReturnStrongWeak(StrongWeak *a) { return *a; } -// CHECK: define{{.*}} void @_Z27testParamContainsStrongWeak18ContainsStrongWeak(ptr noundef %[[A:.*]]) +// CHECK: define{{.*}} void @_Z27testParamContainsStrongWeak18ContainsStrongWeak(ptr dead_on_return noundef %[[A:.*]]) // CHECK: call noundef ptr @_ZN18ContainsStrongWeakD1Ev(ptr {{[^,]*}} %[[A]]) void testParamContainsStrongWeak(ContainsStrongWeak a) { } -// CHECK: define{{.*}} void @_Z26testParamDerivedStrongWeak17DerivedStrongWeak(ptr noundef %[[A:.*]]) +// CHECK: define{{.*}} void @_Z26testParamDerivedStrongWeak17DerivedStrongWeak(ptr dead_on_return noundef %[[A:.*]]) // CHECK: call noundef ptr @_ZN17DerivedStrongWeakD1Ev(ptr {{[^,]*}} %[[A]]) void testParamDerivedStrongWeak(DerivedStrongWeak a) { @@ -163,7 +163,7 @@ Strong testReturnStrong(Strong *a) { return *a; } -// CHECK: define{{.*}} void @_Z21testParamWeakTemplate1SIU6__weakP11objc_objectE(ptr noundef %{{.*}}) +// CHECK: define{{.*}} void @_Z21testParamWeakTemplate1SIU6__weakP11objc_objectE(ptr dead_on_return noundef %{{.*}}) // CHECK: call noundef ptr @_ZN1SIU6__weakP11objc_objectED1Ev( // CHECK-NEXT: ret void @@ -237,7 +237,7 @@ void test0(C *c) { // CHECK: %[[AGG_TMP:.*]] = alloca %[[STRUCT_STRONGWEAK]], align 8 // CHECK: br i1 -// CHECK: call void @objc_msgSend({{.*}}, ptr noundef %[[AGG_TMP]]) +// CHECK: call void @objc_msgSend({{.*}}, ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK: br // CHECK: %[[CALL1:.*]] = call noundef ptr @_ZN10StrongWeakD1Ev(ptr noundef nonnull align 8 dereferenceable(16) %[[AGG_TMP]]) diff --git a/clang/test/CodeGenObjCXX/property-objects.mm b/clang/test/CodeGenObjCXX/property-objects.mm index 7ae20f66177bd..8354794254933 100644 --- 
a/clang/test/CodeGenObjCXX/property-objects.mm +++ b/clang/test/CodeGenObjCXX/property-objects.mm @@ -60,7 +60,7 @@ - (struct CGRect)extent {return bounds;} // CHECK-LABEL: define{{.*}} i32 @main // CHECK: call void @_ZN1SC1ERKS_(ptr {{[^,]*}} [[AGGTMP:%[a-zA-Z0-9\.]+]], ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) {{%[a-zA-Z0-9\.]+}}) -// CHECK: call void @objc_msgSend(ptr noundef {{%[a-zA-Z0-9\.]+}}, ptr noundef {{%[a-zA-Z0-9\.]+}}, ptr noundef [[AGGTMP]]) +// CHECK: call void @objc_msgSend(ptr noundef {{%[a-zA-Z0-9\.]+}}, ptr noundef {{%[a-zA-Z0-9\.]+}}, ptr dead_on_return noundef [[AGGTMP]]) // CHECK-NEXT: ret i32 0 int main() { I *i; diff --git a/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm b/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm index e5cb71bad47c0..9428940d6da48 100644 --- a/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm +++ b/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm @@ -15,7 +15,7 @@ // Check that AddrDiscStrong0 is destructed in the callee. -// CHECK: define void @_Z24testParamAddrDiscStrong015AddrDiscStrong0(ptr noundef %[[A:.*]]) +// CHECK: define void @_Z24testParamAddrDiscStrong015AddrDiscStrong0(ptr dead_on_return noundef %[[A:.*]]) // CHECK: call noundef ptr @_ZN15AddrDiscStrong0D1Ev(ptr noundef nonnull align {{[0-9]+}} dereferenceable(16) %[[A]]) // CHECK: ret void diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 42768ac8def1f..75e9710f96705 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -108,7 +108,7 @@ // GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32" +// GFX1250: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32 // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl index e4ef3defdb341..86c27d48ab0d4 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl @@ -157,6 +157,18 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c) *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true); } +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_f8f6f4( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <12 x i32> +// CHECK-GFX1250-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> [[A:%.*]], i32 2, <12 x i32> [[TMP0]], i16 0, <8 x float> [[C:%.*]]) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, 2, b, 0, c); +} + // CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_f16( // CHECK-GFX1250-NEXT: entry: // CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50f02ad27357..a21862c4a9395 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -4,6 +4,9 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +typedef unsigned int uint; +typedef unsigned short int ushort; +typedef unsigned int __attribute__((ext_vector_type(2))) uint2; typedef half __attribute__((ext_vector_type(2))) half2; // CHECK-LABEL: @test_setprio_inc_wg( @@ -42,6 +45,63 @@ void test_s_wait_tensorcnt() { __builtin_amdgcn_s_wait_tensorcnt(0); } +// CHECK-LABEL: @test_prng_b32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr 
[[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: ret void +// +void test_prng_b32(global uint* out, uint a) { + *out = __builtin_amdgcn_prng_b32(a); +} + +// CHECK-LABEL: @test_tanh_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.tanh.f32(float [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: ret void +// +void test_tanh_f32(global float* out, float a) +{ + *out = __builtin_amdgcn_tanhf(a); +} + +// CHECK-LABEL: @test_tanh_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr addrspace(1) [[TMP0]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.amdgcn.tanh.f16(half [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 +// CHECK-NEXT: ret void +// +void test_tanh_f16(global half* out, global half* a) +{ + *out = __builtin_amdgcn_tanhh(*a); +} + // CHECK-LABEL: @test_tanh_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -80,6 +140,120 @@ void test_rcp_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_rcp_bf16(a); } +// CHECK-LABEL: @test_sqrt_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) 
[[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_sqrt_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_sqrt_bf16(a); +} + +// CHECK-LABEL: @test_rsq_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_rsq_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_rsq_bf16(a); +} + +// CHECK-LABEL: @test_log_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.log.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_log_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_log_bf16(a); +} + +// CHECK-LABEL: @test_exp2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.exp2.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_exp2_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_exp2_bf16(a); +} + +// CHECK-LABEL: @test_sin_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) 
[[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sin.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_sin_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_sin_bf16(a); +} + +// CHECK-LABEL: @test_cos_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.cos.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_cos_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_cos_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -196,6 +370,76 @@ void test_cvt_pk_f16_bf8(global half2* out, short a) out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a); } +// CHECK-LABEL: @test_sat_pk4_i4_i8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP2]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP4]], ptr [[TMP5]], align 2 +// CHECK-NEXT: ret void +// +void test_sat_pk4_i4_i8(ushort *out, uint src) +{ + *out = __builtin_amdgcn_sat_pk4_i4_i8(src); + *out = __builtin_amdgcn_sat_pk4_u4_u8(src); +} + +// CHECK-LABEL: @test_permlane16_swap( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to 
ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 +// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0 +// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 +// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true) +// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0 +// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1 +// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 +// CHECK-NEXT: ret void +// +void test_permlane16_swap(global uint2* out, uint old, uint src) { + *out = __builtin_amdgcn_permlane16_swap(old, src, false, false); + *out = __builtin_amdgcn_permlane16_swap(old, src, true, false); + *out = __builtin_amdgcn_permlane16_swap(old, src, false, true); +} + // CHECK-LABEL: @test_cvt_f32_fp8_e5m3( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl index 8256b61525f9d..177165972b7a9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl @@ -10,3 +10,12 @@ void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int offset, int soffset) { __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 1, offset, 
soffset, 2, 3); } + +// CHECK-LABEL: @test_amdgcn_struct_ptr_buffer_load_lds( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 4, i32 [[VINDEX:%.*]], i32 [[VOFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3) +// CHECK-NEXT: ret void +// +void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int size, int vindex, int voffset, int soffset) { + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, 2, 3); +} diff --git a/clang/test/CodeGenOpenCL/scoped-atomic.cl b/clang/test/CodeGenOpenCL/scoped-atomic.cl new file mode 100644 index 0000000000000..ec7e936684a3a --- /dev/null +++ b/clang/test/CodeGenOpenCL/scoped-atomic.cl @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir-unknown-unknown -verify +// RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir64-unknown-unknown -verify + +// expected-no-diagnostics + +int fi1a(int *i) { + int v; + __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); + return v; +} + +#ifdef __SPIR64__ +long fl1a(long *i) { + long v; + __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); + return v; +} +#endif diff --git a/clang/test/CodeGenSPIRV/Builtins/refract.c b/clang/test/CodeGenSPIRV/Builtins/refract.c new file mode 100644 index 0000000000000..f399462d68d4a --- /dev/null +++ b/clang/test/CodeGenSPIRV/Builtins/refract.c @@ -0,0 +1,74 @@ +// RUN: %clang_cc1 -O1 -triple spirv-pc-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s + +typedef _Float16 half; +typedef half half2 __attribute__((ext_vector_type(2))); +typedef half half3 __attribute__((ext_vector_type(3))); +typedef half half4 __attribute__((ext_vector_type(4))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: define spir_func half @test_refract_half( +// CHECK-SAME: half noundef [[I:%.*]], half noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call half @llvm.spv.refract.f16.f16(half [[I]], half [[N]], half [[ETA]]) +// CHECK-NEXT: ret half [[SPV_REFRACT]] +// +half test_refract_half(half I, half N, half eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <2 x half> @test_refract_half2( +// CHECK-SAME: <2 x half> noundef [[I:%.*]], <2 x half> noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call <2 x half> @llvm.spv.refract.v2f16.f16(<2 x half> [[I]], <2 x half> [[N]], half [[ETA]]) +// CHECK-NEXT: ret <2 x half> [[SPV_REFRACT]] +// +half2 test_refract_half2(half2 I, half2 N, half eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <3 x half> @test_refract_half3( +// CHECK-SAME: <3 x half> noundef [[I:%.*]], <3 x half> noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <3 x half> @llvm.spv.refract.v3f16.f16(<3 x half> [[I]], <3 x half> [[N]], half [[ETA]]) +// CHECK-NEXT: ret <3 x half> [[SPV_REFRACT]] +// +half3 test_refract_half3(half3 I, half3 N, half eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: 
define spir_func <4 x half> @test_refract_half4( +// CHECK-SAME: <4 x half> noundef [[I:%.*]], <4 x half> noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half> [[I]], <4 x half> [[N]], half [[ETA]]) +// CHECK-NEXT: ret <4 x half> [[SPV_REFRACT]] +// +half4 test_refract_half4(half4 I, half4 N, half eta) { return __builtin_spirv_refract(I, N, eta); } + + +// CHECK-LABEL: define spir_func float @test_refract_float( +// CHECK-SAME: float noundef [[I:%.*]], float noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call float @llvm.spv.refract.f32.f32(float [[I]], float [[N]], float [[ETA]]) +// CHECK-NEXT: ret float [[SPV_REFRACT]] +// +float test_refract_float(float I, float N, float eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <2 x float> @test_refract_float2( +// CHECK-SAME: <2 x float> noundef [[I:%.*]], <2 x float> noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call <2 x float> @llvm.spv.refract.v2f32.f32(<2 x float> [[I]], <2 x float> [[N]], float [[ETA]]) +// CHECK-NEXT: ret <2 x float> [[SPV_REFRACT]] +// +float2 test_refract_float2(float2 I, float2 N, float eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <3 x float> @test_refract_float3( +// CHECK-SAME: <3 x float> noundef [[I:%.*]], <3 x float> noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <3 x float> @llvm.spv.refract.v3f32.f32(<3 x float> [[I]], <3 x float> [[N]], float [[ETA]]) +// CHECK-NEXT: ret <3 x float> [[SPV_REFRACT]] +// +float3 test_refract_float3(float3 I, float3 N, float eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <4 x float> @test_refract_float4( +// CHECK-SAME: <4 x float> noundef [[I:%.*]], <4 x float> noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <4 x float> @llvm.spv.refract.v4f32.f32(<4 x float> [[I]], <4 x float> [[N]], float [[ETA]]) +// CHECK-NEXT: ret <4 x float> [[SPV_REFRACT]] +// +float4 test_refract_float4(float4 I, float4 N, float eta) { return __builtin_spirv_refract(I, N, eta); } diff --git a/clang/test/DebugInfo/KeyInstructions/asm.c b/clang/test/DebugInfo/KeyInstructions/asm.c new file mode 100644 index 0000000000000..2b3301660f7ba --- /dev/null +++ b/clang/test/DebugInfo/KeyInstructions/asm.c @@ -0,0 +1,59 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -O0 -emit-llvm -x c %s -o - -gkey-instructions -debug-info-kind=line-tables-only -gno-column-info | FileCheck %s +// Partially copied from clang/test/CodeGen/AArch64/ls64-inline-asm.c + +// Check the inline asm call and result store are Key and distinct atoms. 
+ +struct foo { unsigned long long x[8]; }; +// CHECK-LABEL: define dso_local void @load( +// CHECK-SAME: ptr noundef [[OUTPUT:%.*]], ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG5:![0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUTPUT_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[OUTPUT]], ptr [[OUTPUT_ADDR]], align 8 +// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 8, !dbg [[DBG9:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8, !dbg [[DBG9]] +// CHECK-NEXT: [[TMP2:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[TMP1]]) #[[ATTR1:[0-9]+]], !dbg [[DBG10:![0-9]+]], !srcloc [[META11:![0-9]+]] +// CHECK-NEXT: store i512 [[TMP2]], ptr [[TMP0]], align 8, !dbg [[DBG12:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG13:![0-9]+]] +// +void load(struct foo *output, void *addr) { + __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory"); +} + +// CHECK-LABEL: define dso_local void @load2( +// CHECK-SAME: ptr noundef [[OUTPUT:%.*]], ptr noundef [[ADDR:%.*]]) #[[ATTR0]] !dbg [[DBG14:![0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUTPUT_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[OUTPUT]], ptr [[OUTPUT_ADDR]], align 8 +// CHECK-NEXT: store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 8, !dbg [[DBG15:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8, !dbg [[DBG15]] +// CHECK-NEXT: [[TMP2:%.*]] = call i32 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[TMP1]]) #[[ATTR1]], !dbg [[DBG16:![0-9]+]], !srcloc [[META17:![0-9]+]] +// CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP0]], align 4, !dbg [[DBG18:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG19:![0-9]+]] +// +void load2(int *output, void *addr) { + __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory"); +} +//. 
+// CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None) +// CHECK: [[META1]] = !DIFile(filename: "{{.*}}", directory: {{.*}}) +// CHECK: [[DBG5]] = distinct !DISubprogram(name: "load", scope: [[META6:![0-9]+]], file: [[META6]], line: 21, type: [[META7:![0-9]+]], scopeLine: 21, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], keyInstructions: true) +// CHECK: [[META6]] = !DIFile(filename: "{{.*}}asm.c", directory: {{.*}}) +// CHECK: [[META7]] = !DISubroutineType(types: [[META8:![0-9]+]]) +// CHECK: [[META8]] = !{} +// CHECK: [[DBG9]] = !DILocation(line: 22, scope: [[DBG5]]) +// CHECK: [[DBG10]] = !DILocation(line: 22, scope: [[DBG5]], atomGroup: 1, atomRank: 1) +// CHECK: [[META11]] = !{i64 1458} +// CHECK: [[DBG12]] = !DILocation(line: 22, scope: [[DBG5]], atomGroup: 2, atomRank: 1) +// CHECK: [[DBG13]] = !DILocation(line: 23, scope: [[DBG5]], atomGroup: 3, atomRank: 1) +// CHECK: [[DBG14]] = distinct !DISubprogram(name: "load2", scope: [[META6]], file: [[META6]], line: 38, type: [[META7]], scopeLine: 38, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], keyInstructions: true) +// CHECK: [[DBG15]] = !DILocation(line: 39, scope: [[DBG14]]) +// CHECK: [[DBG16]] = !DILocation(line: 39, scope: [[DBG14]], atomGroup: 1, atomRank: 1) +// CHECK: [[META17]] = !{i64 2501} +// CHECK: [[DBG18]] = !DILocation(line: 39, scope: [[DBG14]], atomGroup: 2, atomRank: 1) +// CHECK: [[DBG19]] = !DILocation(line: 40, scope: [[DBG14]], atomGroup: 3, atomRank: 1) +//. diff --git a/clang/test/DebugInfo/KeyInstructions/goto.c b/clang/test/DebugInfo/KeyInstructions/goto.c new file mode 100644 index 0000000000000..ead92e600ca5b --- /dev/null +++ b/clang/test/DebugInfo/KeyInstructions/goto.c @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -gkey-instructions -x c++ -std=c++17 %s -debug-info-kind=line-tables-only -emit-llvm -o - -gno-column-info \ +// RUN: | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -gkey-instructions -x c %s -debug-info-kind=line-tables-only -emit-llvm -o - -gno-column-info \ +// RUN: | FileCheck %s + +// Check the goto branches get Key Instructions metadata. 
+void ext(); +void test_goto(void) { +// CHECK: br label %dst1, !dbg [[G1R1:!.*]] + goto dst1; +dst1: + ext(); + + void *ptr = &&dst2; +// CHECK: br label %indirectgoto, !dbg [[G3R1:!.*]] + goto *ptr; +dst2: + ext(); + +// CHECK: br label %dst3, !dbg [[G4R1:!.*]] + goto *&&dst3; +dst3: + ext(); + + return; +} + +// CHECK: [[G1R1]] = !DILocation(line: 10, scope: ![[#]], atomGroup: 1, atomRank: 1) +// CHECK: [[G3R1]] = !DILocation(line: 16, scope: ![[#]], atomGroup: 3, atomRank: 1) +// CHECK: [[G4R1]] = !DILocation(line: 21, scope: ![[#]], atomGroup: 4, atomRank: 1) diff --git a/clang/test/Driver/amdgpu-hip-system-arch.c b/clang/test/Driver/amdgpu-hip-system-arch.c index 9c27bc09fb36c..12e298a8636b1 100644 --- a/clang/test/Driver/amdgpu-hip-system-arch.c +++ b/clang/test/Driver/amdgpu-hip-system-arch.c @@ -14,14 +14,14 @@ // RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib --offload-new-driver --offload-arch=native --amdgpu-arch-tool=%t/amdgpu_arch_fail -x hip %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR -// NO-OUTPUT-ERROR: error: cannot determine amdgcn architecture{{.*}}; consider passing it via '--offload-arch' +// NO-OUTPUT-ERROR: error: cannot determine hip architecture{{.*}}; consider passing it via '--offload-arch' // case when amdgpu-arch does not return anything with successful execution // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib --offload-arch=native --amdgpu-arch-tool=%t/amdgpu_arch_empty -x hip %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=EMPTY-OUTPUT // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib --offload-new-driver --offload-arch=native --amdgpu-arch-tool=%t/amdgpu_arch_empty -x hip %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=EMPTY-OUTPUT -// EMPTY-OUTPUT: error: cannot determine amdgcn architecture: No AMD GPU detected in the system; consider passing it via '--offload-arch' +// EMPTY-OUTPUT: error: cannot determine hip architecture: No GPU detected in the system; consider passing it via '--offload-arch' // case when amdgpu-arch returns a gfx906 GPU. 
// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib --offload-arch=native --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 -x hip %s 2>&1 \ @@ -36,4 +36,4 @@ // RUN: --offload-arch=native --amdgpu-arch-tool=%t/amdgpu_arch_gfx906 \ // RUN: -x hip %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=BAD-TIMEOUT -// BAD-TIMEOUT: clang: error: cannot determine amdgcn architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite) +// BAD-TIMEOUT: clang: error: cannot determine hip architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite) diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp index 4dc320191317e..adb59e1debd4e 100644 --- a/clang/test/Driver/baremetal.cpp +++ b/clang/test/Driver/baremetal.cpp @@ -257,7 +257,7 @@ // CHECK-RV64-SAME:"{{.*}}.o" // CHECK-RV64-SAME: "{{[^"]*}}libclang_rt.builtins.a" // CHECK-RV64-SAME: "-lc" -// CHECK-RV64-SAME: "-X" "-o" "{{.*}}.tmp.out" +// CHECK-RV64-SAME: "-o" "{{.*}}.tmp.out" // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ @@ -271,7 +271,7 @@ // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lm" // CHECK-RV64-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a" // CHECK-RV64-DEFAULTCXX-SAME: "-lc" -// CHECK-RV64-DEFAULTCXX-SAME: "-X" "-o" "a.out" +// CHECK-RV64-DEFAULTCXX-SAME: "-o" "a.out" // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ @@ -288,7 +288,7 @@ // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lm" // CHECK-RV64-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a" // CHECK-RV64-LIBCXX-SAME: "-lc" -// CHECK-RV64-LIBCXX-SAME: "-X" "-o" "a.out" +// CHECK-RV64-LIBCXX-SAME: "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv64-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ @@ -305,7 +305,7 @@ // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lm" // CHECK-RV64-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a" // CHECK-RV64-LIBSTDCXX-SAME: "-lc" -// CHECK-RV64-LIBSTDCXX-SAME: "-X" "-o" "a.out" +// CHECK-RV64-LIBSTDCXX-SAME: "-o" "a.out" // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: -L some/directory/user/asked/for \ @@ -325,7 +325,7 @@ // CHECK-RV32-SAME: "{{.*}}.o" // CHECK-RV32-SAME: "{{[^"]*}}libclang_rt.builtins.a" // CHECK-RV32-SAME: "-lc" -// CHECK-RV32-SAME: "-X" "-o" "a.out" +// CHECK-RV32-SAME: "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ -339,7 +339,7 @@ // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lm" // CHECK-RV32-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a" // CHECK-RV32-DEFAULTCXX-SAME: "-lc" -// CHECK-RV32-DEFAULTCXX-SAME: "-X" "-o" "a.out" +// CHECK-RV32-DEFAULTCXX-SAME: "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ -355,7 +355,7 @@ // CHECK-RV32-LIBCXX-SAME: "{{.*}}.o" // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lm" // CHECK-RV32-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a" -// CHECK-RV32-LIBCXX-SAME: "-X" "-o" "a.out" +// CHECK-RV32-LIBCXX-SAME: "-o" "a.out" // RUN: %clangxx 
%s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ -372,7 +372,7 @@ // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lm" // CHECK-RV32-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a" // CHECK-RV32-LIBSTDCXX-SAME: "-lc" -// CHECK-RV32-LIBSTDCXX-SAME: "-X" "-o" "a.out" +// CHECK-RV32-LIBSTDCXX-SAME: "-o" "a.out" // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \ // RUN: -nostdlibinc -nobuiltininc \ diff --git a/clang/test/Driver/cuda-phases.cu b/clang/test/Driver/cuda-phases.cu index 8b91a1d5a7fcf..220a320e32705 100644 --- a/clang/test/Driver/cuda-phases.cu +++ b/clang/test/Driver/cuda-phases.cu @@ -324,8 +324,8 @@ // RUN: -ccc-print-phases --offload-arch=sm_999 -fgpu-rdc -c %s 2>&1 \ // RUN: | FileCheck -check-prefix=INVALID-ARCH %s // INVALID-ARCH: error: unsupported CUDA gpu architecture: sm_999 -// INVALID-ARCH-NEXT: 0: input, "[[INPUT:.+]]", cuda, (host-cuda) -// INVALID-ARCH-NEXT: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) -// INVALID-ARCH-NEXT: 2: compiler, {1}, ir, (host-cuda) -// INVALID-ARCH-NEXT: 3: backend, {2}, assembler, (host-cuda) -// INVALID-ARCH-NEXT: 4: assembler, {3}, object, (host-cuda) +// INVALID-ARCH: 0: input, "[[INPUT:.+]]", cuda +// INVALID-ARCH-NEXT: 1: preprocessor, {0}, cuda-cpp-output +// INVALID-ARCH-NEXT: 2: compiler, {1}, ir +// INVALID-ARCH-NEXT: 3: backend, {2}, assembler +// INVALID-ARCH-NEXT: 4: assembler, {3}, object diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c index f00940bd7613d..6e21671f43775 100644 --- a/clang/test/Driver/frame-pointer-elim.c +++ b/clang/test/Driver/frame-pointer-elim.c @@ -162,7 +162,7 @@ // RUN: FileCheck --check-prefix=KEEP-ALL %s // RUN: %clang -### --target=riscv64-linux-android -O1 -S %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s -// RUN: not %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \ +// RUN: %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s // On ARM backend bare metal targets, frame pointer is omitted diff --git a/clang/test/Driver/hip-inputs.hip b/clang/test/Driver/hip-inputs.hip index 2d4cc3103c5ec..a8e25ad8ed198 100644 --- a/clang/test/Driver/hip-inputs.hip +++ b/clang/test/Driver/hip-inputs.hip @@ -15,5 +15,5 @@ // RUN: --hip-link %S/Inputs/hip_multiple_inputs/a.cu 2>&1 \ // RUN: | FileCheck -check-prefix=MIX %s -// CHECK-NOT: error: mixed CUDA and HIP compilation is not supported -// MIX: error: mixed CUDA and HIP compilation is not supported +// CHECK-NOT: error: mixed CUDA and HIP offloading compilation is not supported +// MIX: error: mixed CUDA and HIP offloading compilation is not supported diff --git a/clang/test/Driver/hip-invalid-target-id.hip b/clang/test/Driver/hip-invalid-target-id.hip index 555043facb2a3..ad942e476617e 100644 --- a/clang/test/Driver/hip-invalid-target-id.hip +++ b/clang/test/Driver/hip-invalid-target-id.hip @@ -4,7 +4,7 @@ // RUN: --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefix=NOPLUS %s -// NOPLUS: error: invalid target ID 'gfx908xnack' +// NOPLUS: error: unsupported HIP gpu architecture: gfx908xnack // RUN: not %clang -### --target=x86_64-linux-gnu \ // RUN: -x hip --offload-arch=gfx900 \ @@ -22,7 +22,7 @@ // RUN: --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefix=UNK %s -// UNK: error: invalid target ID 'gfx908:unknown+' +// UNK: error: unsupported HIP gpu architecture: gfx900+xnack // RUN: not %clang 
-### --target=x86_64-linux-gnu \ // RUN: -x hip --offload-arch=gfx908 \ @@ -31,7 +31,7 @@ // RUN: --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefix=MIXED %s -// MIXED: error: invalid target ID 'gfx908:sramecc+:unknown+' +// MIXED: error: unsupported HIP gpu architecture: gfx900+xnack // RUN: not %clang -### --target=x86_64-linux-gnu \ // RUN: -x hip --offload-arch=gfx908 \ @@ -55,7 +55,7 @@ // RUN: --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefix=NOCOLON %s -// NOCOLON: error: invalid target ID 'gfx900+xnack' +// NOCOLON: error: unsupported HIP gpu architecture: gfx900+xnack // RUN: not %clang -### --target=x86_64-linux-gnu \ // RUN: -x hip --offload-arch=gfx908 \ diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip index 4fb5571b838fb..ba23bc2d59b56 100644 --- a/clang/test/Driver/hip-options.hip +++ b/clang/test/Driver/hip-options.hip @@ -115,11 +115,6 @@ // OMP-NOT: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-fopenmp" // OMP: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-fopenmp" -// RUN: not %clang --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib \ -// RUN: --offload-arch=gfx906 -fopenmp=libomp -fopenmp-targets=amdgcn %s 2>&1 \ -// RUN: | FileCheck -check-prefix=OMPTGT %s -// OMPTGT: unsupported option '--offload-targets=' for language mode 'HIP' - // Check -Xoffload-linker option is passed to lld. // RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib \ diff --git a/clang/test/Driver/invalid-offload-options.cpp b/clang/test/Driver/invalid-offload-options.cpp index 48d5310538a3c..6048a3ca82e77 100644 --- a/clang/test/Driver/invalid-offload-options.cpp +++ b/clang/test/Driver/invalid-offload-options.cpp @@ -1,29 +1,7 @@ // UNSUPPORTED: system-windows -// RUN: not %clang -### -x hip --target=x86_64-linux-gnu --offload= \ -// RUN: --hip-path=%S/Inputs/hipspv -nogpuinc -nogpulib %s \ -// RUN: 2>&1 | FileCheck --check-prefix=INVALID-TARGET %s // RUN: not %clang -### -x hip --target=x86_64-linux-gnu --offload=foo \ // RUN: --hip-path=%S/Inputs/hipspv -nogpuinc -nogpulib %s \ // RUN: 2>&1 | FileCheck --check-prefix=INVALID-TARGET %s // INVALID-TARGET: error: invalid or unsupported offload target: '{{.*}}' - -// In the future we should be able to specify multiple targets for HIP -// compilation but currently it is not supported. 
-// -// RUN: not %clang -### -x hip --target=x86_64-linux-gnu --offload=foo,bar \ -// RUN: --hip-path=%S/Inputs/hipspv -nogpuinc -nogpulib %s \ -// RUN: 2>&1 | FileCheck --check-prefix=TOO-MANY-TARGETS %s -// RUN: not %clang -### -x hip --target=x86_64-linux-gnu \ -// RUN: --offload=foo --offload=bar \ -// RUN: --hip-path=%S/Inputs/hipspv -nogpuinc -nogpulib %s \ -// RUN: 2>&1 | FileCheck --check-prefix=TOO-MANY-TARGETS %s - -// TOO-MANY-TARGETS: error: only one offload target is supported - -// RUN: not %clang -### -x hip --target=x86_64-linux-gnu -nogpuinc -nogpulib \ -// RUN: --offload=amdgcn-amd-amdhsa --offload-arch=gfx900 %s \ -// RUN: 2>&1 | FileCheck --check-prefix=OFFLOAD-ARCH-MIX %s - -// OFFLOAD-ARCH-MIX: error: option '--offload-arch' cannot be specified with '--offload' diff --git a/clang/test/Driver/module-fgen-reduced-bmi.cppm b/clang/test/Driver/module-fgen-reduced-bmi.cppm index 9bdd4c9f6682f..4b893ffbfaae8 100644 --- a/clang/test/Driver/module-fgen-reduced-bmi.cppm +++ b/clang/test/Driver/module-fgen-reduced-bmi.cppm @@ -64,7 +64,8 @@ // RUN: -Wno-missing-reduced-bmi -### 2>&1 | FileCheck Hello.cppm -check-prefix=NO_WARN // // RUN: %clang -std=c++20 Hello.cppm --precompile -o Hello.pcm \ -// RUN: -Wno-missing-reduced-bmi -### 2>&1 | FileCheck Hello.cppm -check-prefix=NO_WARN +// RUN: -fno-modules-reduced-bmi -Wno-missing-reduced-bmi -### 2>&1 | \ +// RUN: FileCheck Hello.cppm -check-prefix=NO_WARN //--- Hello.cppm export module Hello; diff --git a/clang/test/Driver/module-output.cppm b/clang/test/Driver/module-output.cppm index 7cf0771f3d6ef..197f1d85b0f9c 100644 --- a/clang/test/Driver/module-output.cppm +++ b/clang/test/Driver/module-output.cppm @@ -13,28 +13,29 @@ // Tests that the .pcm file will be generated in the same directory with the specified // output and the name of the .pcm file should be the same with the input file. // RUN: %clang -std=c++20 %t/Hello.cppm -fmodule-output -c -o %t/output/Hello.o \ -// RUN: -### 2>&1 | FileCheck %t/Hello.cppm +// RUN: -fno-modules-reduced-bmi -### 2>&1 | FileCheck %t/Hello.cppm // // Tests that the output file will be generated in the input directory if the output // file is not the corresponding object file. // RUN: %clang -std=c++20 %t/Hello.cppm %t/AnotherModule.cppm -fmodule-output -o \ -// RUN: %t/output/a.out -### 2>&1 | FileCheck %t/AnotherModule.cppm +// RUN: %t/output/a.out -fno-modules-reduced-bmi -### 2>&1 | FileCheck %t/AnotherModule.cppm // // Tests that clang will reject the command line if it specifies -fmodule-output with // multiple archs. // RUN: not %clang %t/Hello.cppm -fmodule-output -arch i386 -arch x86_64 -### \ -// RUN: --target=x86_64-apple-darwin 2>&1 | FileCheck %t/Hello.cppm -check-prefix=MULTIPLE-ARCH +// RUN: -fno-modules-reduced-bmi --target=x86_64-apple-darwin 2>&1 | FileCheck %t/Hello.cppm \ +// RUN: -check-prefix=MULTIPLE-ARCH // Tests that the .pcm file will be generated in the same path with the specified one // in the comamnd line. 
// RUN: %clang -std=c++20 %t/Hello.cppm -fmodule-output=%t/pcm/Hello.pcm -o %t/Hello.o \ -// RUN: -c -### 2>&1 | FileCheck %t/Hello.cppm --check-prefix=CHECK-SPECIFIED +// RUN: -fno-modules-reduced-bmi -c -### 2>&1 | FileCheck %t/Hello.cppm --check-prefix=CHECK-SPECIFIED // // RUN: %clang -std=c++20 %t/Hello.cppm -fmodule-output=%t/Hello.pcm -fmodule-output -c -fsyntax-only \ -// RUN: -### 2>&1 | FileCheck %t/Hello.cppm --check-prefix=CHECK-NOT-USED +// RUN: -fno-modules-reduced-bmi -### 2>&1 | FileCheck %t/Hello.cppm --check-prefix=CHECK-NOT-USED // Test that we can emit a warning if the type of the input file is not a module interface unit. -// RUN: %clang -std=c++20 %t/a.cpp -fmodule-output -c -o %t/a.o -### 2>&1 | FileCheck %t/a.cpp +// RUN: %clang -std=c++20 %t/a.cpp -fmodule-output -fno-modules-reduced-bmi -c -o %t/a.o -### 2>&1 | FileCheck %t/a.cpp //--- Hello.cppm export module Hello; diff --git a/clang/test/Driver/modules.cpp b/clang/test/Driver/modules.cpp index 088a73230f81e..edbe8d8e92c85 100644 --- a/clang/test/Driver/modules.cpp +++ b/clang/test/Driver/modules.cpp @@ -34,7 +34,7 @@ // Check combining precompile and compile steps works. // -// RUN: %clang -std=c++2a -x c++-module %t/foo.cpp -S -o %t/foo2.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE --check-prefix=CHECK-COMPILE +// RUN: %clang -std=c++2a -x c++-module -fno-modules-reduced-bmi %t/foo.cpp -S -o %t/foo2.pcm.o -v 2>&1 | FileCheck %s --check-prefix=CHECK-PRECOMPILE --check-prefix=CHECK-COMPILE // Check that .cppm is treated as a module implicitly. // diff --git a/clang/test/Driver/nvptx-cuda-system-arch.c b/clang/test/Driver/nvptx-cuda-system-arch.c index c54eeac73f73b..2d4eca8c43bc3 100644 --- a/clang/test/Driver/nvptx-cuda-system-arch.c +++ b/clang/test/Driver/nvptx-cuda-system-arch.c @@ -16,14 +16,14 @@ // RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib --offload-new-driver --offload-arch=native --nvptx-arch-tool=%t/nvptx_arch_fail -x cuda %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR -// NO-OUTPUT-ERROR: error: cannot determine nvptx64 architecture{{.*}}; consider passing it via '--offload-arch' +// NO-OUTPUT-ERROR: error: cannot determine cuda architecture{{.*}}; consider passing it via '--offload-arch' // case when nvptx-arch does not return anything with successful execution // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib --offload-arch=native --nvptx-arch-tool=%t/nvptx_arch_empty -x cuda %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=EMPTY-OUTPUT // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib --offload-new-driver --offload-arch=native --nvptx-arch-tool=%t/nvptx_arch_empty -x cuda %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=EMPTY-OUTPUT -// EMPTY-OUTPUT: error: cannot determine nvptx64 architecture: No NVIDIA GPU detected in the system; consider passing it via '--offload-arch' +// EMPTY-OUTPUT: error: cannot determine cuda architecture: No GPU detected in the system; consider passing it via '--offload-arch' // case when nvptx-arch does not return anything with successful execution // RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpulib --offload-arch=native --nvptx-arch-tool=%t/nvptx_arch_sm_70 -x cuda --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda %s 2>&1 \ @@ -49,4 +49,4 @@ // RUN: --offload-arch=native --nvptx-arch-tool=%t/nvptx_arch_sm_70 \ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda -x cuda %s 2>&1 | \ // RUN: FileCheck %s 
--check-prefix=BAD-TIMEOUT -// BAD-TIMEOUT: clang: error: cannot determine nvptx64 architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite) +// BAD-TIMEOUT: clang: error: cannot determine cuda architecture: CLANG_TOOLCHAIN_PROGRAM_TIMEOUT expected an integer, got 'foo'; consider passing it via '--offload-arch'; environment variable CLANG_TOOLCHAIN_PROGRAM_TIMEOUT specifies the tool timeout (integer secs, <=0 is infinite) diff --git a/clang/test/Driver/offload-target.c b/clang/test/Driver/offload-target.c new file mode 100644 index 0000000000000..123ecd3eb830e --- /dev/null +++ b/clang/test/Driver/offload-target.c @@ -0,0 +1,22 @@ +// RUN: %clang -### -fsycl --offload-targets=spirv64 -nogpuinc %s -ccc-print-bindings 2>&1 \ +// RUN: | FileCheck %s -check-prefix=SYCL +// SYCL: "spirv64" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[SYCL_BC:.+]]" + +// RUN: %clang -### --offload-targets=amdgcn-amd-amdhsa -nogpulib -nogpuinc -x hip %s -ccc-print-bindings 2>&1 \ +// RUN: | FileCheck %s -check-prefix=HIP +// HIP: "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[AMD_OBJ:.+]]" + +// RUN: %clang -### --offload-targets=nvptx64-nvidia-cuda -nogpulib -nogpuinc -x cuda %s -ccc-print-bindings 2>&1 \ +// RUN: | FileCheck %s -check-prefix=CUDA +// CUDA: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[NV_OBJ:.+]]" + +// RUN: %clang -### --offload-targets=amdgcn-amd-amdhsa,nvptx64-nvidia-cuda -fopenmp \ +// RUN: -Xarch_amdgcn --offload-arch=gfx90a -Xarch_nvptx64 --offload-arch=sm_89 \ +// RUN: -nogpulib -nogpuinc %s -ccc-print-bindings 2>&1 \ +// RUN: | FileCheck %s -check-prefix=OPENMP +// OPENMP: "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[AMD_OBJ:.+]]" +// OPENMP: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]"], output: "[[NV_OBJ:.+]]" + +// RUN: %clang -### --offload-targets=spirv64-amd-amdhsa -nogpulib -nogpuinc -x hip %s -ccc-print-bindings 2>&1 \ +// RUN: | FileCheck %s -check-prefix=HIPSPIRV +// HIPSPIRV: "spirv64-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[AMD_OBJ:.+]]" diff --git a/clang/test/Driver/openacc.c b/clang/test/Driver/openacc.c index c7f1d2545bd03..f46e2a32bcab2 100644 --- a/clang/test/Driver/openacc.c +++ b/clang/test/Driver/openacc.c @@ -1,14 +1,2 @@ // RUN: %clang -S -### -fopenacc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DRIVER // CHECK-DRIVER: "-cc1" {{.*}} "-fopenacc" - -// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override=202211 %s 2>&1 | FileCheck %s --check-prefix=CHECK-MACRO-OVERRIDE -// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override 202211 %s 2>&1 | FileCheck %s --check-prefix=CHECK-MACRO-OVERRIDE -// CHECK-MACRO-OVERRIDE: "-cc1"{{.*}} "-fexperimental-openacc-macro-override" "202211" - -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=2022L11 %s 2>&1 | FileCheck %s 
--check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 2022L11 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// INVALID: error: the clang compiler does not support diff --git a/clang/test/Driver/openbsd.c b/clang/test/Driver/openbsd.c index 6639e9d2d9d67..1f12cfca9488b 100644 --- a/clang/test/Driver/openbsd.c +++ b/clang/test/Driver/openbsd.c @@ -127,9 +127,12 @@ // UNWIND-TABLES: "-funwind-tables=2" // NO-UNWIND-TABLES-NOT: "-funwind-tables=2" -// Check that the -X and --no-relax flags are passed to the linker on riscv64 +// Check that the -X and --no-relax flags are passed to the linker +// RUN: %clang --target=loongarch64-unknown-openbsd -mno-relax -### %s 2>&1 \ +// RUN: | FileCheck --check-prefix=LA64-FLAGS %s // RUN: %clang --target=riscv64-unknown-openbsd -mno-relax -### %s 2>&1 \ // RUN: | FileCheck -check-prefix=RISCV64-FLAGS %s +// LA64-FLAGS: "-X" "--no-relax" // RISCV64-FLAGS: "-X" "--no-relax" // Check passing LTO flags to the linker diff --git a/clang/test/Driver/openmp-offload.c b/clang/test/Driver/openmp-offload.c index 162ff20a9745a..64d45f9479fb6 100644 --- a/clang/test/Driver/openmp-offload.c +++ b/clang/test/Driver/openmp-offload.c @@ -7,7 +7,7 @@ /// Check whether an invalid OpenMP target is specified: // RUN: not %clang -### -fopenmp=libomp -fopenmp-targets=aaa-bbb-ccc-ddd %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-INVALID-TARGET %s -// CHK-INVALID-TARGET: error: OpenMP target is invalid: 'aaa-bbb-ccc-ddd' +// CHK-INVALID-TARGET: error: invalid or unsupported offload target: 'aaa-bbb-ccc-ddd' /// ########################################################################### @@ -18,15 +18,6 @@ /// ########################################################################### -/// Check error for no -fopenmp option -// RUN: not %clang -### -fopenmp-targets=powerpc64le-ibm-linux-gnu %s 2>&1 \ -// RUN: | FileCheck -check-prefix=CHK-NO-FOPENMP %s -// RUN: not %clang -### -fopenmp=libgomp -fopenmp-targets=powerpc64le-ibm-linux-gnu %s 2>&1 \ -// RUN: | FileCheck -check-prefix=CHK-NO-FOPENMP %s -// CHK-NO-FOPENMP: error: '-fopenmp-targets' must be used in conjunction with a '-fopenmp' option compatible with offloading; e.g., '-fopenmp=libomp' or '-fopenmp=libiomp5' - -/// ########################################################################### - /// Check warning for duplicate offloading targets. 
// RUN: %clang -### -ccc-print-phases -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu,powerpc64le-ibm-linux-gnu %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-DUPLICATES %s diff --git a/clang/test/Driver/openmp-system-arch.c b/clang/test/Driver/openmp-system-arch.c index b18ecf3ec474b..167b07a23f512 100644 --- a/clang/test/Driver/openmp-system-arch.c +++ b/clang/test/Driver/openmp-system-arch.c @@ -24,13 +24,7 @@ // RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp --offload-arch=native \ // RUN: --nvptx-arch-tool=%t/nvptx_arch_empty --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp --offload-arch= \ -// RUN: --nvptx-arch-tool=%t/nvptx_arch_fail --amdgpu-arch-tool=%t/amdgpu_arch_fail %s 2>&1 \ -// RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp --offload-arch= \ -// RUN: --nvptx-arch-tool=%t/nvptx_arch_empty --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \ -// RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR -// NO-OUTPUT-ERROR: error: failed to deduce triple for target architecture 'native'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead +// NO-OUTPUT-ERROR: error: cannot determine openmp architecture // case when amdgpu-arch succeeds. // RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp --offload-arch=native \ diff --git a/clang/test/Driver/print-multi-selection-flags.c b/clang/test/Driver/print-multi-selection-flags.c index 5f9383fbed8f4..b1a0a29ec4180 100644 --- a/clang/test/Driver/print-multi-selection-flags.c +++ b/clang/test/Driver/print-multi-selection-flags.c @@ -107,3 +107,39 @@ // CHECK-AARCH64-MULTILIB-CUSTOM-FLAG: --target=aarch64-unknown-none-eabi // CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=foo // CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=bar + +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fropi | FileCheck --check-prefixes=CHECK-ROPI,CHECK-NO-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -frwpi | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fropi -frwpi | FileCheck --check-prefixes=CHECK-ROPI,CHECK-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fno-ropi -fno-rwpi | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fpic | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-PIC1 %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fPIC | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-PIC2 %s +// RUN: %clang 
-multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fpie | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-PIE1 %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fPIE | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-PIE2 %s +// CHECK-PIC2: -fPIC +// CHECK-PIE2: -fPIE +// CHECK-NO-PIC: -fno-pic +// CHECK-NO-ROPI: -fno-ropi +// CHECK-NO-RWPI: -fno-rwpi +// CHECK-PIC1: -fpic +// CHECK-PIE1: -fpie +// CHECK-ROPI: -fropi +// CHECK-RWPI: -frwpi + +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -Os | FileCheck --check-prefix=CHECK-OPT-OS %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -Oz | FileCheck --check-prefix=CHECK-OPT-OZ %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a | FileCheck --check-prefix=CHECK-OPT %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -O1 | FileCheck --check-prefix=CHECK-OPT %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -O2 | FileCheck --check-prefix=CHECK-OPT %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -O3 | FileCheck --check-prefix=CHECK-OPT %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -Os | FileCheck --check-prefix=CHECK-OPT-OS %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -Oz | FileCheck --check-prefix=CHECK-OPT-OZ %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi | FileCheck --check-prefix=CHECK-OPT %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -O1 | FileCheck --check-prefix=CHECK-OPT %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -O2 | FileCheck --check-prefix=CHECK-OPT %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -O3 | FileCheck --check-prefix=CHECK-OPT %s +// CHECK-OPT-OZ: -Oz +// CHECK-OPT-OS: -Os +// CHECK-OPT-NOT: -Oz +// CHECK-OPT-NOT: -Os diff --git a/clang/test/FixIt/fixit-format-ios-nopedantic.m b/clang/test/FixIt/fixit-format-ios-nopedantic.m index db9ac797c2472..836a4b5372f13 100644 --- a/clang/test/FixIt/fixit-format-ios-nopedantic.m +++ b/clang/test/FixIt/fixit-format-ios-nopedantic.m @@ -1,5 +1,5 @@ // RUN: cp %s %t -// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat -Werror -fixit %t +// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat -fixit %t int printf(const char *restrict, ...); typedef unsigned int NSUInteger; diff --git a/clang/test/FixIt/format.m b/clang/test/FixIt/format.m index 950765bad9339..e97ae10c974aa 100644 --- a/clang/test/FixIt/format.m +++ b/clang/test/FixIt/format.m @@ -237,14 +237,14 @@ void testSizeTypes(void) { printf("%zu", 0.f); // 
expected-warning-re{{format specifies type 'size_t' (aka '{{.+}}') but the argument has type 'float'}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f" - printf("%zd", 0.f); // expected-warning-re{{format specifies type 'ssize_t' (aka '{{.+}}') but the argument has type 'float'}} + printf("%zd", 0.f); // expected-warning-re{{format specifies type 'signed size_t' (aka '{{.+}}') but the argument has type 'float'}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f" short x; #if !defined(__ANDROID__) && !defined(__Fuchsia__) - printf("%zn", &x); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'short *'}} + printf("%zn", &x); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'short *'}} #else - printf("%zn", &x); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'short *'}} + printf("%zn", &x); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'short *'}} // expected-warning@-1 {{'%n' specifier not supported on this platform}} #endif // !defined(__ANDROID__) && !defined(__Fuchsia__) // PrintfSpecifier::fixType doesn't handle %n, so a fix-it is not emitted, diff --git a/clang/test/Headers/spirv_ids.cpp b/clang/test/Headers/spirv_ids.cpp index 0cd74dbca53aa..466be5deee87a 100644 --- a/clang/test/Headers/spirv_ids.cpp +++ b/clang/test/Headers/spirv_ids.cpp @@ -53,58 +53,58 @@ // CHECK: call i32 @llvm.spv.subgroup.id() // CHECK: call i32 @llvm.spv.subgroup.local.invocation.id() -// NV: call noundef i64 @_Z21__spirv_NumWorkgroupsi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z21__spirv_NumWorkgroupsi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z21__spirv_NumWorkgroupsi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z21__spirv_WorkgroupSizei(i32 noundef 0) #2 -// NV: call noundef i64 @_Z21__spirv_WorkgroupSizei(i32 noundef 1) #2 -// NV: call noundef i64 @_Z21__spirv_WorkgroupSizei(i32 noundef 2) #2 -// NV: call noundef i64 @_Z19__spirv_WorkgroupIdi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z19__spirv_WorkgroupIdi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z19__spirv_WorkgroupIdi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z25__spirv_LocalInvocationIdi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z25__spirv_LocalInvocationIdi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z25__spirv_LocalInvocationIdi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z26__spirv_GlobalInvocationIdi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z26__spirv_GlobalInvocationIdi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z26__spirv_GlobalInvocationIdi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z18__spirv_GlobalSizei(i32 noundef 0) #2 -// NV: call noundef i64 @_Z18__spirv_GlobalSizei(i32 noundef 1) #2 -// NV: call noundef i64 @_Z18__spirv_GlobalSizei(i32 noundef 2) #2 -// NV: call noundef i64 @_Z20__spirv_GlobalOffseti(i32 noundef 0) #2 -// NV: call noundef i64 @_Z20__spirv_GlobalOffseti(i32 noundef 1) #2 -// NV: call noundef i64 @_Z20__spirv_GlobalOffseti(i32 noundef 2) #2 -// NV: call noundef i32 @_Z20__spirv_SubgroupSizev() #2 -// NV: call noundef i32 @_Z23__spirv_SubgroupMaxSizev() #2 -// NV: call noundef i32 @_Z20__spirv_NumSubgroupsv() #2 -// NV: call noundef i32 @_Z18__spirv_SubgroupIdv() #2 -// NV: call noundef i32 @_Z33__spirv_SubgroupLocalInvocationIdv() #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 noundef 0) #2 +// NV: call noundef i64 
@_Z28__spirv_BuiltInNumWorkgroupsi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 noundef 0) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 noundef 1) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 noundef 2) #2 +// NV: call noundef i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z25__spirv_BuiltInGlobalSizei(i32 noundef 0) #2 +// NV: call noundef i64 @_Z25__spirv_BuiltInGlobalSizei(i32 noundef 1) #2 +// NV: call noundef i64 @_Z25__spirv_BuiltInGlobalSizei(i32 noundef 2) #2 +// NV: call noundef i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 noundef 0) #2 +// NV: call noundef i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 noundef 1) #2 +// NV: call noundef i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 noundef 2) #2 +// NV: call noundef i32 @_Z27__spirv_BuiltInSubgroupSizev() #2 +// NV: call noundef i32 @_Z30__spirv_BuiltInSubgroupMaxSizev() #2 +// NV: call noundef i32 @_Z27__spirv_BuiltInNumSubgroupsv() #2 +// NV: call noundef i32 @_Z25__spirv_BuiltInSubgroupIdv() #2 +// NV: call noundef i32 @_Z40__spirv_BuiltInSubgroupLocalInvocationIdv() #2 void test_id_and_range() { - __spirv_NumWorkgroups(0); - __spirv_NumWorkgroups(1); - __spirv_NumWorkgroups(2); - __spirv_WorkgroupSize(0); - __spirv_WorkgroupSize(1); - __spirv_WorkgroupSize(2); - __spirv_WorkgroupId(0); - __spirv_WorkgroupId(1); - __spirv_WorkgroupId(2); - __spirv_LocalInvocationId(0); - __spirv_LocalInvocationId(1); - __spirv_LocalInvocationId(2); - __spirv_GlobalInvocationId(0); - __spirv_GlobalInvocationId(1); - __spirv_GlobalInvocationId(2); - __spirv_GlobalSize(0); - __spirv_GlobalSize(1); - __spirv_GlobalSize(2); - __spirv_GlobalOffset(0); - __spirv_GlobalOffset(1); - __spirv_GlobalOffset(2); - unsigned int ssize = __spirv_SubgroupSize(); - unsigned int smax = __spirv_SubgroupMaxSize(); - unsigned int snum = __spirv_NumSubgroups(); - unsigned int sid = __spirv_SubgroupId(); - unsigned int sinvocid = __spirv_SubgroupLocalInvocationId(); + __spirv_BuiltInNumWorkgroups(0); + __spirv_BuiltInNumWorkgroups(1); + __spirv_BuiltInNumWorkgroups(2); + __spirv_BuiltInWorkgroupSize(0); + __spirv_BuiltInWorkgroupSize(1); + __spirv_BuiltInWorkgroupSize(2); + __spirv_BuiltInWorkgroupId(0); + __spirv_BuiltInWorkgroupId(1); + __spirv_BuiltInWorkgroupId(2); + __spirv_BuiltInLocalInvocationId(0); + __spirv_BuiltInLocalInvocationId(1); + __spirv_BuiltInLocalInvocationId(2); + __spirv_BuiltInGlobalInvocationId(0); + __spirv_BuiltInGlobalInvocationId(1); + __spirv_BuiltInGlobalInvocationId(2); + __spirv_BuiltInGlobalSize(0); + __spirv_BuiltInGlobalSize(1); + __spirv_BuiltInGlobalSize(2); + __spirv_BuiltInGlobalOffset(0); + __spirv_BuiltInGlobalOffset(1); + __spirv_BuiltInGlobalOffset(2); + unsigned int ssize = __spirv_BuiltInSubgroupSize(); + 
unsigned int smax = __spirv_BuiltInSubgroupMaxSize(); + unsigned int snum = __spirv_BuiltInNumSubgroups(); + unsigned int sid = __spirv_BuiltInSubgroupId(); + unsigned int sinvocid = __spirv_BuiltInSubgroupLocalInvocationId(); } diff --git a/clang/test/Headers/stdarg.cpp b/clang/test/Headers/stdarg.cpp index 20bf17caf15f7..bfc3af11a23b6 100644 --- a/clang/test/Headers/stdarg.cpp +++ b/clang/test/Headers/stdarg.cpp @@ -15,8 +15,8 @@ #include -// AARCH64-C: define {{.*}} @f(i32 noundef %n, ptr noundef %list) -// AARCH64-CXX: define {{.*}} @_Z1fiSt9__va_list(i32 noundef %n, ptr noundef %list) +// AARCH64-C: define {{.*}} @f(i32 noundef %n, ptr dead_on_return noundef %list) +// AARCH64-CXX: define {{.*}} @_Z1fiSt9__va_list(i32 noundef %n, ptr dead_on_return noundef %list) // X86_64-C: define {{.*}} @f(i32 noundef %n, ptr noundef %list) // X86_64-CXX: define {{.*}} @_Z1fiP13__va_list_tag(i32 noundef %n, ptr noundef %list) // PPC64-C: define {{.*}} @f(i32 noundef signext %n, ptr noundef %list) diff --git a/clang/test/Interpreter/pretty-print.c b/clang/test/Interpreter/pretty-print.c index d21749a649e1c..56488a164719b 100644 --- a/clang/test/Interpreter/pretty-print.c +++ b/clang/test/Interpreter/pretty-print.c @@ -3,9 +3,88 @@ // RUN: cat %s | clang-repl -Xcc -xc | FileCheck %s // RUN: cat %s | clang-repl -Xcc -std=c++11 | FileCheck %s -// Fails with `Symbols not found: [ __clang_Interpreter_SetValueNoAlloc ]`. // UNSUPPORTED: hwasan + +char c = 'a'; c +// CHECK: (char) 'a' + const char* c_str = "Hello, world!"; c_str +// CHECK-NEXT: (const char *) "Hello, world!" + +c_str = "Goodbye, world!"; c_str +// CHECK-NEXT: (const char *) "Goodbye, world!" + +const char* c_null_str = 0; c_null_str +// CHECK-NEXT: (const char *) 0 + +"Hello, world" +// CHECK-NEXT: ({{(const )?}}char[13]) "Hello, world" + +int x = 42; x +// CHECK-NEXT: (int) 42 + +&x +// CHECK-NEXT: (int *) 0x{{[0-9a-f]+}} + +x - 2 +// CHECK-NEXT: (int) 40 + +float f = 4.2f; f +// CHECK-NEXT: (float) 4.20000f + +double d = 4.21; d +// CHECK-NEXT: (double) 4.2100000 + +long double tau = 6.2831853; tau +// CHECK-NEXT: (long double) 6.28318530000L + +int foo() { return 42; } foo() +// CHECK-NEXT: (int) 42 + +void bar(int a, float b) {} bar +// CHECK-NEXT: (void (int, float)) Function @0x{{[0-9a-f]+}} +// CHECK-NEXT: void bar(int a, float b) { + +bar +// CHECK: (void (int, float)) Function @0x{{[0-9a-f]+}} +// CHECK-NEXT: void bar(int a, float b) { + +// Arrays. + +int arr[3] = {1,2,3}; arr +// CHECK: (int[3]) { 1, 2, 3 } + +double darr[3][4] = { {1,2,3,4}, {5,6,7,8}, {9,10,11,12} }; darr +// CHECK-NEXT: (double[3][4]) { { 1.0, 2.0, 3.0, 4.0 }, { 5.0, 6.0, 7.0, 8.0 }, { 9.0, 10.0, 11.0, 12.0 } } + +float farr[2][1] = { {0}, {3.14}}; farr +// CHECK-NEXT: (float[2][1]) { { 0.0f }, { 3.14000f } } + +0./0. +// CHECK-NEXT: (double) nan + +1.0f / 0.0f +// CHECK-NEXT: (float) inf + +0.00001f +// CHECK-NEXT: (float) 1.00000e-05f + +int * ptr = (int*)0x123; ptr +// CHECK-NEXT: (int *) 0x123 + +int * null_ptr = (int*)0; null_ptr +// CHECK-NEXT: (int *) 0x0 + +// TODO: _Bool, _Complex, _Atomic, and _BitInt +// union U { int I; float F; } u; u.I = 12; u.I +// TODO-CHECK-NEXT: (int) 12 +// struct S1{} s1; s1 +// TODO-CHECK-NEXT: (S1 &) @0x{{[0-9a-f]+}} + +// struct S2 {int d;} E = {22}; E +// TODO-CHECK-NEXT: (struct S2 &) @0x{{[0-9a-f]+}} +// E.d +// TODO-CHECK-NEXT: (int) 22 -// CHECK: Not implement yet. 
+%quit diff --git a/clang/test/Interpreter/pretty-print.cpp b/clang/test/Interpreter/pretty-print.cpp new file mode 100644 index 0000000000000..fd79d315e48ba --- /dev/null +++ b/clang/test/Interpreter/pretty-print.cpp @@ -0,0 +1,73 @@ +// RUN: clang-repl "int i = 10;" 'extern "C" int printf(const char*,...);' \ +// RUN: 'auto r1 = printf("i = %d\n", i);' | FileCheck --check-prefix=CHECK-DRIVER %s +// UNSUPPORTED: system-aix +// CHECK-DRIVER: i = 10 +// RUN: cat %s | clang-repl -Xcc -std=c++11 -Xcc -fno-delayed-template-parsing | FileCheck %s +extern "C" int printf(const char*,...); + +"ab" +// CHECK: (const char[3]) "ab" + +123456 +// CHECK-NEXT: (int) 123456 + +char ch[2] = {'1','a'}; ch +// CHECK-NEXT: (char[2]) { '1', 'a' } + +char chnull[3] = {'1','a', '\0'}; chnull +// CHECK-NEXT: (char[3]) "1a" + +char ch_arr[2][3][1] = {{{'a'}, {'b'}, {'c'}}, {{'d'}, {'e'}, {'f'}}}; ch_arr +// CHECK: (char[2][3][1]) { { { 'a' }, { 'b' }, { 'c' } }, { { 'd' }, { 'e' }, { 'f' } } } +struct S3 { int* p; S3() { p = new int(42); } ~S3() { delete p; } }; +S3{} +// CHECK-NEXT: (S3) @0x{{[0-9a-f]+}} +S3 s3; +s3 +// CHECK-NEXT: (S3 &) @0x{{[0-9a-f]+}} + +struct S4 { ~S4() { printf("~S4()\n"); }}; +S4{} +// CHECK-NEXT: (S4) @0x{{[0-9a-f]+}} +// TODO-CHECK-NEXT: ~S4() + +enum Enum{ e1 = -12, e2, e3=33, e4, e5 = 33}; +e2 +// CHECK-NEXT: (Enum) (e2) : int -11 +::e1 +// CHECK-NEXT: (Enum) (e1) : int -12 + +enum class Color { R = 0, G, B }; +Color::R +// CHECK-NEXT: (Color) (Color::R) : int 0 + + +// Lambdas. + +auto Lambda1 = []{}; +Lambda1 +// CHECK-NEXT: ((lambda) &) @0x{{[0-9a-f]+}} +[]{} +// CHECK-NEXT: ((lambda at input_line_{{[0-9]+}}:1:1)) @0x{{[0-9a-f]+}} + +template struct F{ enum {RET=F::RET*n} ; }; +template<> struct F<0> { enum {RET = 1}; }; +F<7>::RET +// CHECK-NEXT: (F<7>::(unnamed enum at input_line_{{[0-9]+}}:1:27)) (F<7>::RET) : unsigned int 5040 + +struct S5 { int foo() { return 42; }}; +&S5::foo +// CHECK-NEXT: (int (S5::*)()) Function @0x{{[0-9a-f]+}} + +// int i = 12; +// int &iref = i; +// iref +// // TODO-CHECK-NEXT: (int &) 12 + +// int &&rref = 100; +// rref + +// // TODO-CHECK-NEXT: (int &&) 100 + +%quit + diff --git a/clang/test/Modules/cxx20-10-2-ex1.cpp b/clang/test/Modules/cxx20-10-2-ex1.cpp index 0cd6f77466f4b..749b15213098a 100644 --- a/clang/test/Modules/cxx20-10-2-ex1.cpp +++ b/clang/test/Modules/cxx20-10-2-ex1.cpp @@ -14,7 +14,7 @@ export int x; module; #include "std-10-2-ex1.h" -// expected-error@std-10-2-ex1.h:* {{export declaration can only be used within a module purview}} +// expected-error@std-10-2-ex1.h:* {{export declaration can only be used within a module interface}} export module M1; export namespace {} // expected-error {{anonymous namespaces cannot be exported}} diff --git a/clang/test/Modules/cxx20-export-import.cpp b/clang/test/Modules/cxx20-export-import.cpp index 0b505668e8589..c14883e575575 100644 --- a/clang/test/Modules/cxx20-export-import.cpp +++ b/clang/test/Modules/cxx20-export-import.cpp @@ -11,4 +11,4 @@ export module dummy; //--- test.cpp -export import dummy; // expected-error {{export declaration can only be used within a module purview}} +export import dummy; // expected-error {{export declaration can only be used within a module interface}} diff --git a/clang/test/Modules/cxx20-import-diagnostics-a.cpp b/clang/test/Modules/cxx20-import-diagnostics-a.cpp index 1b38259e0358c..72a31ea1d7d78 100644 --- a/clang/test/Modules/cxx20-import-diagnostics-a.cpp +++ b/clang/test/Modules/cxx20-import-diagnostics-a.cpp @@ -110,7 +110,7 @@ module; module AOK1; 
-export import C; // expected-error {{export declaration can only be used within a module purview}} +export import C; // expected-error {{export declaration can only be used within a module interface}} int theAnswer () { return 42; } diff --git a/clang/test/Modules/export-in-non-modules.cpp b/clang/test/Modules/export-in-non-modules.cpp index 69360eb46d774..7b2575c60f1fd 100644 --- a/clang/test/Modules/export-in-non-modules.cpp +++ b/clang/test/Modules/export-in-non-modules.cpp @@ -1,4 +1,4 @@ // RUN: %clang_cc1 -std=c++20 %s -fsyntax-only -verify -export struct Unit { // expected-error {{export declaration can only be used within a module purview}} +export struct Unit { // expected-error {{export declaration can only be used within a module interface}} bool operator<(const Unit &); }; diff --git a/clang/test/Modules/mingw-exceptions.cppm b/clang/test/Modules/mingw-exceptions.cppm index db7aa2ce90a94..be9d61d7d3418 100644 --- a/clang/test/Modules/mingw-exceptions.cppm +++ b/clang/test/Modules/mingw-exceptions.cppm @@ -1,5 +1,6 @@ // REQUIRES: x86-registered-target -// RUN: %clang -target x86_64-windows-gnu -x c++-module -std=gnu++23 -c -o /dev/null -Xclang -disable-llvm-passes %s +// RUN: %clang -target x86_64-windows-gnu -x c++-module -std=gnu++23 -fno-modules-reduced-bmi \ +// RUN: -c -o /dev/null -Xclang -disable-llvm-passes %s // Make sure the command succeeds and doesn't break on the -exception-model flag in cc1. export module empty; diff --git a/clang/test/OpenMP/for_firstprivate_codegen.cpp b/clang/test/OpenMP/for_firstprivate_codegen.cpp index 0255e1e3d4aea..83b5939799642 100644 --- a/clang/test/OpenMP/for_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/for_firstprivate_codegen.cpp @@ -427,7 +427,7 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB2]], i32 4, ptr @_Z5tmainIiET_v.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP0]]) // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 @@ -469,7 +469,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -543,12 +543,12 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -572,7 +572,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE6]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done6: -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP8]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP7]], ptr noundef [[AGG_TMP8]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP8]]) #[[ATTR2]] @@ -611,7 +611,7 @@ int main() { // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC4]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP16]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP9]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP9]], align 8, !nonnull [[META3]], 
!align [[META4]] // CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP19]] to i64 // CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR5]], i64 0, i64 [[IDXPROM11]] @@ -711,7 +711,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1044,8 +1044,8 @@ int main() { // CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr @g1, align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] +// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr @g1, align 8, !nonnull [[META3]], !align [[META4]] // CHECK4-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8 // CHECK4-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK4-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1086,7 +1086,7 @@ int main() { // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK4-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK4-NEXT: store i32 1, ptr [[G]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK4-NEXT: store volatile i32 1, ptr [[TMP13]], align 4 // CHECK4-NEXT: store i32 2, ptr [[SIVAR3]], align 4 // CHECK4-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[BLOCK]], i32 0, i32 0 @@ -1103,7 +1103,7 @@ int main() { // CHECK4-NEXT: [[TMP14:%.*]] = load volatile i32, ptr [[G]], align 4 // CHECK4-NEXT: store volatile i32 [[TMP14]], ptr [[BLOCK_CAPTURED]], align 8 // CHECK4-NEXT: [[BLOCK_CAPTURED5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[BLOCK]], i32 0, i32 5 -// CHECK4-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK4-NEXT: store ptr [[TMP15]], ptr [[BLOCK_CAPTURED5]], align 8 // CHECK4-NEXT: [[BLOCK_CAPTURED6:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[BLOCK]], i32 0, i32 7 // CHECK4-NEXT: [[TMP16:%.*]] = load i32, ptr [[SIVAR3]], align 4 @@ -1137,7 +1137,7 @@ int main() { // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 // CHECK4-NEXT: store i32 2, ptr [[BLOCK_CAPTURE_ADDR]], align 8 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 -// 
CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR1]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK4-NEXT: store i32 2, ptr [[TMP0]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 7 // CHECK4-NEXT: store i32 4, ptr [[BLOCK_CAPTURE_ADDR2]], align 4 diff --git a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp index 93e71b9a8312e..441e809dc59e5 100644 --- a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp @@ -350,9 +350,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META4:![0-9]+]], !align [[META5:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC1]], ptr align 4 [[TMP0]], i32 8, i1 false) // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2 @@ -524,7 +524,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK1-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[C]], align 4 // CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A3]], ptr [[A2]], align 4 @@ -535,22 +535,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK1-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK1-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: store ptr [[TMP1]], ptr [[C7]], align 4 // CHECK1-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[E9]], ptr [[E]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load 
i32, ptr [[A_CASTED]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK1-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK1-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[C_CASTED]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP9]], ptr [[TMP10]]) // CHECK1-NEXT: ret void // @@ -578,25 +578,25 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4 // CHECK1-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 // CHECK1-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 // CHECK1-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[E3]], ptr align 4 [[TMP2]], i32 16, i1 false) // CHECK1-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK1-NEXT: store i32 [[INC]], ptr [[TMP3]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK1-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP5]], -1 // CHECK1-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 // CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP7]], 1 // CHECK1-NEXT: store i32 [[DIV]], ptr [[TMP6]], align 4 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP4]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP4]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP8]], i32 0, i32 2 // CHECK1-NEXT: store i32 1111, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: ret void @@ -656,7 +656,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr 
[[S_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -738,9 +738,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i32 8, i1 false) // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 @@ -840,7 +840,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store i32 0, ptr [[A]], align 4 // CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SST]], ptr [[THIS1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A3]], ptr [[A2]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A2]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[A_CASTED]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_CASTED]], align 4 @@ -862,7 +862,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK1-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 @@ -896,7 +896,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -965,7 +965,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: 
[[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK3-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META4:![0-9]+]], !align [[META5:![0-9]+]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[C]], align 4 // CHECK3-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[A3]], ptr [[A2]], align 4 @@ -976,22 +976,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK3-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK3-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: store ptr [[TMP1]], ptr [[C7]], align 4 // CHECK3-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK3-NEXT: store ptr [[E9]], ptr [[E]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK3-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK3-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[C_CASTED]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP9]], ptr [[TMP10]]) // CHECK3-NEXT: ret void // @@ -1020,22 +1020,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4 // CHECK3-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 // CHECK3-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 // CHECK3-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[E3]], ptr align 4 [[TMP2]], i32 16, i1 false) // CHECK3-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: store ptr [[TMP5]], ptr [[TMP4]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK3-NEXT: store ptr [[B_ADDR]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4 // CHECK3-NEXT: call void @_ZZN2SSC1ERiENKUlvE_clEv(ptr nonnull align 4 dereferenceable(16) [[REF_TMP]]) // CHECK3-NEXT: ret void @@ -1053,32 +1053,32 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0:%.*]], ptr [[THIS1]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK3-NEXT: store i32 [[INC]], ptr [[TMP3]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 // CHECK3-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP7]], -1 // CHECK3-NEXT: store i32 [[DEC]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, 
ptr [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 // CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 1 // CHECK3-NEXT: store i32 [[DIV]], ptr [[TMP9]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 // CHECK3-NEXT: store i32 [[TMP13]], ptr [[A_CASTED]], align 4 // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[A_CASTED]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK3-NEXT: store i32 [[TMP17]], ptr [[B_CASTED]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[B_CASTED]], align 4 // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 // CHECK3-NEXT: store i32 [[TMP21]], ptr [[C_CASTED]], align 4 // CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[C_CASTED]], align 4 @@ -1106,14 +1106,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 // CHECK3-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK3-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK3-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP3]], -1 // CHECK3-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 1 // CHECK3-NEXT: store i32 [[DIV]], ptr [[TMP4]], align 4 @@ -1252,7 +1252,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK4-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META6:![0-9]+]], !align [[META7:![0-9]+]] // CHECK4-NEXT: store ptr [[TMP0]], ptr [[C]], align 4 // CHECK4-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK4-NEXT: store ptr [[A3]], ptr [[A2]], align 4 @@ -1263,22 
+1263,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK4-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK4-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: store ptr [[TMP1]], ptr [[C7]], align 4 // CHECK4-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK4-NEXT: store ptr [[E9]], ptr [[E]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK4-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4 // CHECK4-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK4-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK4-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK4-NEXT: [[TMP9:%.*]] = load i32, ptr [[C_CASTED]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP9]], ptr [[TMP10]]) // CHECK4-NEXT: ret void // @@ -1307,11 +1307,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4 // CHECK4-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 // CHECK4-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 // CHECK4-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[E3]], ptr align 4 [[TMP2]], i32 16, i1 false) // CHECK4-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 4 // CHECK4-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 0 @@ -1327,13 +1327,13 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[BLOCK_CAPTURED_THIS_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 5 // CHECK4-NEXT: store ptr [[TMP0]], ptr [[BLOCK_CAPTURED_THIS_ADDR]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 6 -// CHECK4-NEXT: [[TMP3:%.*]] = load 
ptr, ptr [[TMP]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: store ptr [[TMP3]], ptr [[BLOCK_CAPTURED]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURED5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 7 // CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK4-NEXT: store i32 [[TMP4]], ptr [[BLOCK_CAPTURED5]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURED6:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 8 -// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: store ptr [[TMP5]], ptr [[BLOCK_CAPTURED6]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], ptr [[BLOCK]], i32 0, i32 3 // CHECK4-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 @@ -1354,7 +1354,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[BLOCK_CAPTURED_THIS:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 // CHECK4-NEXT: [[THIS:%.*]] = load ptr, ptr [[BLOCK_CAPTURED_THIS]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 // CHECK4-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 @@ -1363,12 +1363,12 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP2]], -1 // CHECK4-NEXT: store i32 [[DEC]], ptr [[BLOCK_CAPTURE_ADDR1]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 8 -// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR2]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP4]], 1 // CHECK4-NEXT: store i32 [[DIV]], ptr [[TMP3]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR3]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR3]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 // CHECK4-NEXT: store i32 [[TMP6]], ptr [[A_CASTED]], align 4 // CHECK4-NEXT: [[TMP7:%.*]] = load i32, ptr [[A_CASTED]], align 4 @@ -1377,7 +1377,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: store i32 [[TMP8]], ptr [[B_CASTED]], align 4 // CHECK4-NEXT: [[TMP9:%.*]] = load i32, ptr [[B_CASTED]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, 
i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 8 -// CHECK4-NEXT: [[TMP10:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR5]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR5]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 // CHECK4-NEXT: store i32 [[TMP11]], ptr [[C_CASTED]], align 4 // CHECK4-NEXT: [[TMP12:%.*]] = load i32, ptr [[C_CASTED]], align 4 @@ -1405,14 +1405,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK4-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 // CHECK4-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK4-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 // CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK4-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP3]], -1 // CHECK4-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 1 // CHECK4-NEXT: store i32 [[DIV]], ptr [[TMP4]], align 4 @@ -1530,9 +1530,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK9-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC1]], ptr align 4 [[TMP0]], i64 8, i1 false) // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2 @@ -1704,7 +1704,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK9-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK9-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: store ptr [[TMP0]], ptr [[C]], align 8 // CHECK9-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[A3]], ptr [[A2]], align 8 @@ -1715,22 +1715,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // 
CHECK9-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK9-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: store ptr [[TMP1]], ptr [[C7]], align 8 // CHECK9-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK9-NEXT: store ptr [[E9]], ptr [[E]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8 // CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK9-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK9-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8 -// CHECK9-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8 +// CHECK9-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK9-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK9-NEXT: [[TMP9:%.*]] = load i64, ptr [[C_CASTED]], align 8 -// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8 +// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP9]], ptr [[TMP10]]) // CHECK9-NEXT: ret void // @@ -1758,25 +1758,25 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i64 [[C]], ptr [[C_ADDR]], align 8 // CHECK9-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK9-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 // CHECK9-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[E3]], ptr align 4 [[TMP2]], i64 16, i1 false) // CHECK9-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 8 -// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK9-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK9-NEXT: store i32 [[INC]], ptr [[TMP3]], align 4 // CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK9-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP5]], -1 // CHECK9-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK9-NEXT: [[TMP6:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK9-NEXT: [[TMP6:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 // CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 
[[TMP7]], 1 // CHECK9-NEXT: store i32 [[DIV]], ptr [[TMP6]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP4]], align 8 +// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP4]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP8]], i64 0, i64 2 // CHECK9-NEXT: store i32 1111, ptr [[ARRAYIDX]], align 4 // CHECK9-NEXT: ret void @@ -1836,7 +1836,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1918,9 +1918,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i64 8, i1 false) // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 @@ -2020,7 +2020,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i32 0, ptr [[A]], align 4 // CHECK9-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SST]], ptr [[THIS1]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[A3]], ptr [[A2]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A2]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK9-NEXT: store i32 [[TMP1]], ptr [[A_CASTED]], align 4 // CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[A_CASTED]], align 8 @@ -2042,7 +2042,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK9-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK9-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 @@ 
-2076,7 +2076,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -2145,7 +2145,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK11-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK11-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK11-NEXT: store ptr [[TMP0]], ptr [[C]], align 8 // CHECK11-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK11-NEXT: store ptr [[A3]], ptr [[A2]], align 8 @@ -2156,22 +2156,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK11-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK11-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: store ptr [[TMP1]], ptr [[C7]], align 8 // CHECK11-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK11-NEXT: store ptr [[E9]], ptr [[E]], align 8 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8 +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK11-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK11-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8 // CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK11-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK11-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8 -// CHECK11-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8 +// CHECK11-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK11-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK11-NEXT: [[TMP9:%.*]] = load i64, ptr [[C_CASTED]], align 8 -// CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8 +// CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP9]], ptr [[TMP10]]) // CHECK11-NEXT: ret void // @@ -2200,22 +2200,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store i64 [[C]], ptr [[C_ADDR]], align 8 // CHECK11-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 // CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK11-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 // CHECK11-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 8 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[E3]], ptr align 4 [[TMP2]], i64 16, i1 false) // CHECK11-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 8 // CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK11-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8 // CHECK11-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: store ptr [[TMP5]], ptr [[TMP4]], align 8 // CHECK11-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK11-NEXT: store ptr [[B_ADDR]], ptr [[TMP6]], align 8 // CHECK11-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK11-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8 // CHECK11-NEXT: call void @_ZZN2SSC1ERiENKUlvE_clEv(ptr nonnull align 8 dereferenceable(32) [[REF_TMP]]) // CHECK11-NEXT: ret void @@ -2233,32 +2233,32 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0:%.*]], ptr [[THIS1]], i32 0, i32 0 // CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK11-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK11-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK11-NEXT: store i32 [[INC]], ptr [[TMP3]], align 4 // CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 +// CHECK11-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 // CHECK11-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP7]], -1 // CHECK11-NEXT: store i32 [[DEC]], ptr [[TMP6]], align 4 // CHECK11-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 3 
-// CHECK11-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8 +// CHECK11-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 // CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 1 // CHECK11-NEXT: store i32 [[DIV]], ptr [[TMP9]], align 4 // CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 8 +// CHECK11-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 // CHECK11-NEXT: store i32 [[TMP13]], ptr [[A_CASTED]], align 4 // CHECK11-NEXT: [[TMP14:%.*]] = load i64, ptr [[A_CASTED]], align 8 // CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8 +// CHECK11-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK11-NEXT: store i32 [[TMP17]], ptr [[B_CASTED]], align 4 // CHECK11-NEXT: [[TMP18:%.*]] = load i64, ptr [[B_CASTED]], align 8 // CHECK11-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +// CHECK11-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 // CHECK11-NEXT: store i32 [[TMP21]], ptr [[C_CASTED]], align 4 // CHECK11-NEXT: [[TMP22:%.*]] = load i64, ptr [[C_CASTED]], align 8 @@ -2286,14 +2286,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK11-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK11-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK11-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK11-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 // CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK11-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP3]], -1 // CHECK11-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 1 // CHECK11-NEXT: store i32 [[DIV]], ptr [[TMP4]], align 4 @@ -2432,7 +2432,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK12-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK12-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK12-NEXT: store ptr [[TMP0]], ptr [[C]], align 8 // CHECK12-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr 
[[THIS1]], i32 0, i32 0 // CHECK12-NEXT: store ptr [[A3]], ptr [[A2]], align 8 @@ -2443,22 +2443,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK12-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK12-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8 +// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: store ptr [[TMP1]], ptr [[C7]], align 8 // CHECK12-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK12-NEXT: store ptr [[E9]], ptr [[E]], align 8 -// CHECK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8 +// CHECK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK12-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK12-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8 // CHECK12-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK12-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK12-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8 -// CHECK12-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8 +// CHECK12-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK12-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK12-NEXT: [[TMP9:%.*]] = load i64, ptr [[C_CASTED]], align 8 -// CHECK12-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8 +// CHECK12-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP9]], ptr [[TMP10]]) // CHECK12-NEXT: ret void // @@ -2487,11 +2487,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: store i64 [[C]], ptr [[C_ADDR]], align 8 // CHECK12-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 // CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK12-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 // CHECK12-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 8 -// CHECK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[E3]], ptr align 4 [[TMP2]], i64 16, i1 false) // CHECK12-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 8 // CHECK12-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 0 @@ -2507,13 +2507,13 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[BLOCK_CAPTURED_THIS_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 5 // CHECK12-NEXT: store ptr [[TMP0]], ptr [[BLOCK_CAPTURED_THIS_ADDR]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 6 -// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: store ptr [[TMP3]], ptr [[BLOCK_CAPTURED]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURED5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 8 // CHECK12-NEXT: [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK12-NEXT: store i32 [[TMP4]], ptr [[BLOCK_CAPTURED5]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURED6:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 7 -// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: store ptr [[TMP5]], ptr [[BLOCK_CAPTURED6]], align 8 // CHECK12-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], ptr [[BLOCK]], i32 0, i32 3 // CHECK12-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 @@ -2534,7 +2534,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[BLOCK_CAPTURED_THIS:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 // CHECK12-NEXT: [[THIS:%.*]] = load ptr, ptr [[BLOCK_CAPTURED_THIS]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR]], align 8 +// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !nonnull [[META5]], !align 
[[META6]] // CHECK12-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK12-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 // CHECK12-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 @@ -2543,12 +2543,12 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP2]], -1 // CHECK12-NEXT: store i32 [[DEC]], ptr [[BLOCK_CAPTURE_ADDR1]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 7 -// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR2]], align 8 +// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP4]], 1 // CHECK12-NEXT: store i32 [[DIV]], ptr [[TMP3]], align 4 // CHECK12-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR3]], align 8 +// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 // CHECK12-NEXT: store i32 [[TMP6]], ptr [[A_CASTED]], align 4 // CHECK12-NEXT: [[TMP7:%.*]] = load i64, ptr [[A_CASTED]], align 8 @@ -2557,7 +2557,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: store i32 [[TMP8]], ptr [[B_CASTED]], align 4 // CHECK12-NEXT: [[TMP9:%.*]] = load i64, ptr [[B_CASTED]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURE_ADDR5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 7 -// CHECK12-NEXT: [[TMP10:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR5]], align 8 +// CHECK12-NEXT: [[TMP10:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR5]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 // CHECK12-NEXT: store i32 [[TMP11]], ptr [[C_CASTED]], align 4 // CHECK12-NEXT: [[TMP12:%.*]] = load i64, ptr [[C_CASTED]], align 8 @@ -2585,14 +2585,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK12-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK12-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 -// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK12-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK12-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 // CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK12-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP3]], -1 // CHECK12-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK12-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 1 // CHECK12-NEXT: store i32 [[DIV]], ptr [[TMP4]], align 4 @@ -2660,11 +2660,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: 
store i64 [[VLA2]], ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: store i64 [[VLA4]], ptr [[VLA_ADDR5]], align 8 // CHECK17-NEXT: store ptr [[VLA26]], ptr [[VLA2_ADDR]], align 8 -// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8 +// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 +// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META3]], !align [[META5:![0-9]+]] // CHECK17-NEXT: [[TMP5:%.*]] = call ptr @llvm.stacksave.p0() // CHECK17-NEXT: store ptr [[TMP5]], ptr [[SAVED_STACK]], align 8 // CHECK17-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP2]], [[TMP3]] @@ -2751,8 +2751,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 -// CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 8 +// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META3]], !align [[META5]] +// CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK17-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP2]], [[TMP3]] // CHECK17-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 // CHECK17-NEXT: [[TMP8:%.*]] = add nuw i64 [[TMP7]], 127 diff --git a/clang/test/OpenMP/sections_firstprivate_codegen.cpp b/clang/test/OpenMP/sections_firstprivate_codegen.cpp index 7c6d1839fb10e..32c5826e6f75d 100644 --- a/clang/test/OpenMP/sections_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/sections_firstprivate_codegen.cpp @@ -448,7 +448,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -517,10 +517,10 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull 
[[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_LB_]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_SECTIONS_UB_]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_SECTIONS_ST_]], align 4 @@ -668,7 +668,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -996,7 +996,7 @@ int main() { // CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK4-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_LB_]], align 4 // CHECK4-NEXT: store i32 1, ptr [[DOTOMP_SECTIONS_UB_]], align 4 // CHECK4-NEXT: store i32 1, ptr [[DOTOMP_SECTIONS_ST_]], align 4 diff --git a/clang/test/OpenMP/single_firstprivate_codegen.cpp b/clang/test/OpenMP/single_firstprivate_codegen.cpp index 27cd220adf225..31ea1ca4952fc 100644 --- a/clang/test/OpenMP/single_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/single_firstprivate_codegen.cpp @@ -403,7 +403,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -467,10 +467,10 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 
8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_single(ptr @[[GLOB1]], i32 [[TMP5]]) @@ -585,7 +585,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -908,7 +908,7 @@ int main() { // CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_single(ptr @[[GLOB1]], i32 [[TMP2]]) diff --git a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp index aa50d8fb3aabd..a171827a18646 100644 --- a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp @@ -354,9 +354,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -396,9 +396,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], 
align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -550,12 +550,12 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VEC]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -655,7 +655,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -711,14 +711,14 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK1-NEXT: ret void // @@ -752,9 +752,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -776,7 +776,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done4: -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -815,7 +815,7 @@ int main() { // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP16]] to i64 // CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i64 0, i64 [[IDXPROM9]] @@ -912,7 +912,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1155,9 +1155,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// 
CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1197,9 +1197,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -1349,12 +1349,12 @@ int main() { // CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1 // CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[VEC]], ptr [[TMP4]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -1454,7 +1454,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1510,14 
+1510,14 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK3-NEXT: ret void // @@ -1551,9 +1551,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1575,7 +1575,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -1613,7 +1613,7 @@ int main() { // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP14]] // CHECK3-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], 
align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP16]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP15]], i32 4, i1 false) @@ -1709,7 +1709,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1877,7 +1877,7 @@ int main() { // CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK9-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -1943,13 +1943,13 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store volatile i32 1, ptr [[TMP8]], align 4 // CHECK9-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[G_ADDR]], ptr [[TMP9]], align 8 // CHECK9-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store ptr [[TMP11]], ptr [[TMP10]], align 8 // CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP12]], align 8 diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp index 78e40e54671ac..678770520f677 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp @@ -415,9 +415,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// 
CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -459,9 +459,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -616,9 +616,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 @@ -741,12 +741,12 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VEC]], ptr [[TMP4]], 
align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -846,7 +846,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -902,14 +902,14 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK1-NEXT: ret void // @@ -944,9 +944,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -968,7 +968,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done4: -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -1005,7 +1005,7 @@ int main() { // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP16]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined.omp_outlined, i64 [[TMP13]], i64 [[TMP15]], ptr [[VEC2]], i64 [[TMP17]], ptr [[S_ARR3]], ptr [[TMP18]]) // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -1082,9 +1082,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1112,7 +1112,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE5]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done5: -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP7]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) #[[ATTR2]] @@ -1151,7 +1151,7 @@ int main() { // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP16]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC3]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP18]] to i64 // CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i64 0, i64 [[IDXPROM10]] @@ -1233,7 +1233,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1476,9 +1476,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr 
[[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1520,9 +1520,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -1675,9 +1675,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 @@ -1796,12 +1796,12 @@ int main() { // CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1 // CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: 
[[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[VEC]], ptr [[TMP4]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -1901,7 +1901,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1957,14 +1957,14 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK3-NEXT: ret void // @@ -1999,9 +1999,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -2023,7 +2023,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -2058,7 +2058,7 @@ int main() { // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP14]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined.omp_outlined, i32 [[TMP12]], i32 [[TMP13]], ptr [[VEC2]], i32 [[TMP15]], ptr [[S_ARR3]], ptr [[TMP16]]) // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -2135,9 +2135,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -2163,7 +2163,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -2201,7 +2201,7 @@ int main() { // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP16]] // CHECK3-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP18]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP17]], i32 4, i1 false) @@ -2282,7 +2282,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -2450,7 +2450,7 @@ int main() { // CHECK5-NEXT: [[TMP0:%.*]] = load i32, 
ptr [[G_ADDR]], align 4 // CHECK5-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK5-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK5-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK5-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK5-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -2520,7 +2520,7 @@ int main() { // CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK5-NEXT: store i32 [[TMP11]], ptr [[G_CASTED]], align 4 // CHECK5-NEXT: [[TMP12:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK5-NEXT: [[TMP14:%.*]] = load volatile i32, ptr [[TMP13]], align 4 // CHECK5-NEXT: store i32 [[TMP14]], ptr [[G1_CASTED]], align 4 // CHECK5-NEXT: [[TMP15:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -2607,13 +2607,13 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK5-NEXT: store volatile i32 1, ptr [[TMP10]], align 4 // CHECK5-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK5-NEXT: store ptr [[G_ADDR]], ptr [[TMP11]], align 8 // CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK5-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8 // CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK5-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP14]], align 8 @@ -2659,9 +2659,9 @@ int main() { // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK13-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK13-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -2703,9 +2703,9 @@ int main() { // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK13-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, 
ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -2860,9 +2860,9 @@ int main() { // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK13-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK13-NEXT: [[TMP3:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 @@ -2987,14 +2987,14 @@ int main() { // CHECK13-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK13-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK13-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK13-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK13-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK13-NEXT: ret void // @@ -3029,9 +3029,9 @@ int main() { // CHECK13-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -3053,7 +3053,7 @@ int main() { // CHECK13-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK13-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK13: omp.arraycpy.done4: -// CHECK13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK13-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK13-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR3]] @@ -3090,7 +3090,7 @@ int main() { // CHECK13-NEXT: [[TMP16:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP16]], ptr [[T_VAR_CASTED]], align 4 // CHECK13-NEXT: [[TMP17:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK13-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK13-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined.omp_outlined, i64 [[TMP13]], i64 [[TMP15]], ptr [[VEC2]], i64 [[TMP17]], ptr [[S_ARR3]], ptr [[TMP18]]) // CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK13: omp.inner.for.inc: @@ -3167,9 +3167,9 @@ int main() { // CHECK13-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -3197,7 +3197,7 @@ int main() { // CHECK13-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK13-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE5]], label [[OMP_ARRAYCPY_BODY]] // CHECK13: omp.arraycpy.done5: -// CHECK13-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK13-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) // CHECK13-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP7]]) // CHECK13-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) #[[ATTR3]] @@ -3236,7 +3236,7 @@ int main() { // CHECK13-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP16]] to i64 // CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC3]], i64 0, i64 [[IDXPROM]] // CHECK13-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK13-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// CHECK13-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK13-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP18]] to i64 // CHECK13-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i64 0, i64 [[IDXPROM10]] @@ -3321,7 +3321,7 @@ int main() { // CHECK13-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK13-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK13-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -3352,7 +3352,7 @@ int main() { // CHECK13-NEXT: store ptr [[T]], ptr 
[[T_INDIRECT_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK13-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK13-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -3379,9 +3379,9 @@ int main() { // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9:![0-9]+]], !align [[META10:![0-9]+]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK15-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -3423,9 +3423,9 @@ int main() { // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -3578,9 +3578,9 @@ int main() { // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 @@ -3701,14 +3701,14 
@@ int main() { // CHECK15-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK15-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK15-NEXT: ret void // @@ -3743,9 +3743,9 @@ int main() { // CHECK15-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -3767,7 +3767,7 @@ int main() { // CHECK15-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK15-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK15: omp.arraycpy.done4: -// CHECK15-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK15-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK15-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR3]] @@ -3802,7 +3802,7 @@ int main() { // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[TMP14]], ptr [[T_VAR_CASTED]], align 4 // CHECK15-NEXT: [[TMP15:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK15-NEXT: 
[[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined.omp_outlined, i32 [[TMP12]], i32 [[TMP13]], ptr [[VEC2]], i32 [[TMP15]], ptr [[S_ARR3]], ptr [[TMP16]]) // CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK15: omp.inner.for.inc: @@ -3879,9 +3879,9 @@ int main() { // CHECK15-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -3907,7 +3907,7 @@ int main() { // CHECK15-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK15-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK15: omp.arraycpy.done4: -// CHECK15-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK15-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP6]]) // CHECK15-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR3]] @@ -3945,7 +3945,7 @@ int main() { // CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP16]] // CHECK15-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK15-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP18]] // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP17]], i32 4, i1 false) @@ -4029,7 +4029,7 @@ int main() { // CHECK15-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK15-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 
// CHECK15-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -4060,7 +4060,7 @@ int main() { // CHECK15-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK15-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK15-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -4089,7 +4089,7 @@ int main() { // CHECK17-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK17-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK17-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK17-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META6:![0-9]+]], !align [[META7:![0-9]+]] // CHECK17-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK17-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK17-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -4159,7 +4159,7 @@ int main() { // CHECK17-NEXT: [[TMP11:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK17-NEXT: store i32 [[TMP11]], ptr [[G_CASTED]], align 4 // CHECK17-NEXT: [[TMP12:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META6]], !align [[META7]] // CHECK17-NEXT: [[TMP14:%.*]] = load volatile i32, ptr [[TMP13]], align 4 // CHECK17-NEXT: store i32 [[TMP14]], ptr [[G1_CASTED]], align 4 // CHECK17-NEXT: [[TMP15:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -4246,13 +4246,13 @@ int main() { // CHECK17-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK17-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK17-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK17-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK17-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META6]], !align [[META7]] // CHECK17-NEXT: store volatile i32 1, ptr [[TMP10]], align 4 // CHECK17-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK17-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK17-NEXT: store ptr [[G_ADDR]], ptr [[TMP11]], align 8 // CHECK17-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[CLASS_ANON]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META6]], !align [[META7]] // CHECK17-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8 // CHECK17-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[CLASS_ANON]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK17-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP14]], align 8 diff --git a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp index 294fcba7872b3..f3c9565a17656 100644 --- a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp @@ -357,9 +357,9 @@ int main() { // 
CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -399,9 +399,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -553,14 +553,14 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[TMP6]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -660,7 +660,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: 
[[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -716,14 +716,14 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK1-NEXT: ret void // @@ -757,9 +757,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -781,7 +781,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done4: -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr 
noundef [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -820,7 +820,7 @@ int main() { // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP16]] to i64 // CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i64 0, i64 [[IDXPROM9]] @@ -917,7 +917,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1160,9 +1160,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1202,9 +1202,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -1354,14 +1354,14 @@ int main() { // CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1 // 
CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store i32 [[TMP2]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -1461,7 +1461,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1517,14 +1517,14 @@ int main() { // CHECK3-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK3-NEXT: ret void // @@ -1558,9 +1558,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1582,7 +1582,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -1620,7 +1620,7 @@ int main() { // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP14]] // CHECK3-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP16]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP15]], i32 4, i1 false) @@ -1716,7 +1716,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1884,7 +1884,7 @@ int main() { // CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK9-NEXT: 
[[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK9-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -1950,13 +1950,13 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store volatile i32 1, ptr [[TMP8]], align 4 // CHECK9-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[G_ADDR]], ptr [[TMP9]], align 8 // CHECK9-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store ptr [[TMP11]], ptr [[TMP10]], align 8 // CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP12]], align 8 diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp index d742b0a85af42..037aa12d57226 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp @@ -391,9 +391,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -435,9 +435,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align 
[[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -592,9 +592,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 @@ -717,14 +717,14 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[TMP6]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -824,7 +824,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -880,14 +880,14 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // 
CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK1-NEXT: ret void // @@ -922,9 +922,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -946,7 +946,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done4: -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -983,7 +983,7 @@ int main() { // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP16]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined, i64 [[TMP13]], i64 [[TMP15]], ptr [[VEC2]], i64 [[TMP17]], ptr [[S_ARR3]], ptr [[TMP18]]) // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -1060,9 +1060,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1090,7 +1090,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE5]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done5: -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP7]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) #[[ATTR2]] @@ -1129,7 +1129,7 @@ int main() { // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP16]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC3]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP18]] to i64 // CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i64 0, i64 [[IDXPROM10]] @@ -1211,7 +1211,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1454,9 +1454,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr 
[[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1498,9 +1498,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -1653,9 +1653,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 @@ -1774,14 +1774,14 @@ int main() { // CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1 // CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 -// CHECK3-NEXT: 
[[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store i32 [[TMP2]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -1881,7 +1881,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1937,14 +1937,14 @@ int main() { // CHECK3-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK3-NEXT: ret void // @@ -1979,9 +1979,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -2003,7 +2003,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -2038,7 +2038,7 @@ int main() { // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP14]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined, i32 [[TMP12]], i32 [[TMP13]], ptr [[VEC2]], i32 [[TMP15]], ptr [[S_ARR3]], ptr [[TMP16]]) // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -2115,9 +2115,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -2143,7 +2143,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -2181,7 +2181,7 @@ int main() { // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP16]] // CHECK3-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP18]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP17]], i32 4, i1 false) @@ -2262,7 +2262,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -2430,7 +2430,7 @@ int main() { // CHECK9-NEXT: [[TMP0:%.*]] = load i32, 
ptr [[G_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK9-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -2500,7 +2500,7 @@ int main() { // CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP11]], ptr [[G_CASTED]], align 4 // CHECK9-NEXT: [[TMP12:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: [[TMP14:%.*]] = load volatile i32, ptr [[TMP13]], align 4 // CHECK9-NEXT: store i32 [[TMP14]], ptr [[G1_CASTED]], align 4 // CHECK9-NEXT: [[TMP15:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -2587,13 +2587,13 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store volatile i32 1, ptr [[TMP10]], align 4 // CHECK9-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[G_ADDR]], ptr [[TMP11]], align 8 // CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8 // CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP14]], align 8 diff --git a/clang/test/OpenMP/teams_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_firstprivate_codegen.cpp index 6f21c9e31bd8d..fec8fcb78f91e 100644 --- a/clang/test/OpenMP/teams_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_firstprivate_codegen.cpp @@ -458,9 +458,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK9-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META11:![0-9]+]], !align [[META12:![0-9]+]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = 
load i64, ptr [[T_VAR_CASTED]], align 8 @@ -493,9 +493,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK9-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC1]], ptr align 4 [[TMP0]], i64 8, i1 false) // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2 @@ -808,7 +808,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -872,9 +872,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -902,9 +902,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP1:%.*]] = 
load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i64 8, i1 false) // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 @@ -1032,7 +1032,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1250,9 +1250,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK11-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META12:![0-9]+]], !align [[META13:![0-9]+]] +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1285,9 +1285,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK11-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC1]], ptr align 4 [[TMP0]], i32 8, i1 false) // CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2 @@ -1600,7 +1600,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: 
store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK11-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1664,9 +1664,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 4 // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1694,9 +1694,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i32 8, i1 false) // CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 @@ -1824,7 +1824,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw 
[[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1997,7 +1997,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8 // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 +// CHECK17-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] // CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[S_ADDR]], align 8 // CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[VLA1_ADDR]], align 8 // CHECK17-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8 @@ -2031,11 +2031,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: store i64 [[VLA2]], ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: store i64 [[VLA4]], ptr [[VLA_ADDR5]], align 8 // CHECK17-NEXT: store ptr [[VLA26]], ptr [[VLA2_ADDR]], align 8 -// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8 +// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8, !nonnull [[META8]], !align [[META10:![0-9]+]] // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 +// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK17-NEXT: [[TMP5:%.*]] = call ptr @llvm.stacksave.p0() // CHECK17-NEXT: store ptr [[TMP5]], ptr [[SAVED_STACK]], align 8 // CHECK17-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP2]], [[TMP3]] @@ -2235,7 +2235,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8 // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 +// CHECK17-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[VLA1_ADDR]], align 8 // CHECK17-NEXT: [[TMP6:%.*]] = load ptr, ptr [[S_ADDR]], align 8 @@ -2273,8 +2273,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 -// CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 8 +// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 8, !nonnull [[META8]], !align [[META10]] // CHECK17-NEXT: [[TMP6:%.*]] = call ptr @llvm.stacksave.p0() // CHECK17-NEXT: store ptr [[TMP6]], ptr [[SAVED_STACK]], align 8 // CHECK17-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP2]], [[TMP3]] @@ -2460,7 +2460,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4 // CHECK19-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: [[TMP2:%.*]] = load 
i32, ptr [[VLA_ADDR5]], align 4 -// CHECK19-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4 +// CHECK19-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] // CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[S_ADDR]], align 4 // CHECK19-NEXT: [[TMP5:%.*]] = load ptr, ptr [[VLA1_ADDR]], align 4 // CHECK19-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 4 @@ -2494,11 +2494,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: store i32 [[VLA2]], ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: store i32 [[VLA4]], ptr [[VLA_ADDR5]], align 4 // CHECK19-NEXT: store ptr [[VLA26]], ptr [[VLA2_ADDR]], align 4 -// CHECK19-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4 +// CHECK19-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK19-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4 // CHECK19-NEXT: [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: [[TMP3:%.*]] = load i32, ptr [[VLA_ADDR5]], align 4 -// CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4 +// CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK19-NEXT: [[TMP5:%.*]] = call ptr @llvm.stacksave.p0() // CHECK19-NEXT: store ptr [[TMP5]], ptr [[SAVED_STACK]], align 4 // CHECK19-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP2]], [[TMP3]] @@ -2696,7 +2696,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4 // CHECK19-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR5]], align 4 -// CHECK19-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4 +// CHECK19-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK19-NEXT: [[TMP5:%.*]] = load ptr, ptr [[VLA1_ADDR]], align 4 // CHECK19-NEXT: [[TMP6:%.*]] = load ptr, ptr [[S_ADDR]], align 4 @@ -2734,8 +2734,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: [[TMP1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK19-NEXT: [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: [[TMP3:%.*]] = load i32, ptr [[VLA_ADDR5]], align 4 -// CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4 -// CHECK19-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 4 +// CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK19-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK19-NEXT: [[TMP6:%.*]] = call ptr @llvm.stacksave.p0() // CHECK19-NEXT: store ptr [[TMP6]], ptr [[SAVED_STACK]], align 4 // CHECK19-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP2]], [[TMP3]] diff --git a/clang/test/Preprocessor/arm-acle-6.4.c b/clang/test/Preprocessor/arm-acle-6.4.c index 2c8f4868263a6..48deba74c8ab2 100644 --- a/clang/test/Preprocessor/arm-acle-6.4.c +++ b/clang/test/Preprocessor/arm-acle-6.4.c @@ -188,6 +188,37 @@ // RUN: %clang --target=arm-arm-none-eabi -mcpu=cortex-m33 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-M-DSP // RUN: %clang --target=arm-arm-none-eabi -march=armv8m.main+dsp -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-M-DSP +// RUN: %clang -target arm-none-linux-eabi -march=armv8m.base -x c -E -dM %s -o - | 
FileCheck %s -check-prefix CHECK-V8M-BASE + +// CHECK-V8M-BASE-NOT: __ARM_ARCH_ISA_ARM +// CHECK-V8M-BASE-NOT: __ARM_FEATURE_DSP +// CHECK-V8M-BASE-NOT: __ARM_FEATURE_SIMD32 +// CHECK-V8M-BASE: __ARM_ARCH 8 +// CHECK-V8M-BASE: __ARM_ARCH_ISA_THUMB 1 +// CHECK-V8M-BASE: __ARM_ARCH_PROFILE 'M' +// CHECK-V8M-BASE: __ARM_FEATURE_CLZ 1 +// CHECK-V8M-BASE: __ARM_FEATURE_IDIV 1 +// CHECK-V8M-BASE: __ARM_FEATURE_LDREX 0x7 +// CHECK-V8M-BASE: __ARM_FEATURE_QBIT 1 +// CHECK-V8M-BASE: __ARM_FEATURE_SAT +// CHECK-V8M-BASE-NOT: __ARM_FEATURE_UNALIGNED + +// RUN: %clang -target arm-none-linux-eabi -march=armv8m.main -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V8M-MAIN +// RUN: %clang -target arm-none-linux-eabi -march=armv8.1m.main -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V8M-MAIN + +// CHECK-V8M-MAIN-NOT: __ARM_ARCH_ISA_ARM +// CHECK-V8M-MAIN-NOT: __ARM_FEATURE_DSP +// CHECK-V8M-MAIN-NOT: __ARM_FEATURE_SIMD32 +// CHECK-V8M-MAIN: __ARM_ARCH 8 +// CHECK-V8M-MAIN: __ARM_ARCH_ISA_THUMB 2 +// CHECK-V8M-MAIN: __ARM_ARCH_PROFILE 'M' +// CHECK-V8M-MAIN: __ARM_FEATURE_CLZ 1 +// CHECK-V8M-MAIN: __ARM_FEATURE_IDIV 1 +// CHECK-V8M-MAIN: __ARM_FEATURE_LDREX 0x7 +// CHECK-V8M-MAIN: __ARM_FEATURE_QBIT 1 +// CHECK-V8M-MAIN: __ARM_FEATURE_SAT 1 +// CHECK-V8M-MAIN: __ARM_FEATURE_UNALIGNED 1 + // CHECK-M-DSP: __ARM_FEATURE_DSP 1 // CHECK-M-DSP: __ARM_FEATURE_SIMD32 1 diff --git a/clang/test/Preprocessor/init-mips.c b/clang/test/Preprocessor/init-mips.c index 4fead33bd826e..125872a001bac 100644 --- a/clang/test/Preprocessor/init-mips.c +++ b/clang/test/Preprocessor/init-mips.c @@ -80,10 +80,10 @@ // MIPS32BE:#define __INTMAX_MAX__ 9223372036854775807LL // MIPS32BE:#define __INTMAX_TYPE__ long long int // MIPS32BE:#define __INTMAX_WIDTH__ 64 -// MIPS32BE:#define __INTPTR_FMTd__ "ld" -// MIPS32BE:#define __INTPTR_FMTi__ "li" -// MIPS32BE:#define __INTPTR_MAX__ 2147483647L -// MIPS32BE:#define __INTPTR_TYPE__ long int +// MIPS32BE:#define __INTPTR_FMTd__ "d" +// MIPS32BE:#define __INTPTR_FMTi__ "i" +// MIPS32BE:#define __INTPTR_MAX__ 2147483647 +// MIPS32BE:#define __INTPTR_TYPE__ int // MIPS32BE:#define __INTPTR_WIDTH__ 32 // MIPS32BE:#define __INT_FAST16_FMTd__ "hd" // MIPS32BE:#define __INT_FAST16_FMTi__ "hi" @@ -185,8 +185,8 @@ // MIPS32BE:#define __UINTMAX_MAX__ 18446744073709551615ULL // MIPS32BE:#define __UINTMAX_TYPE__ long long unsigned int // MIPS32BE:#define __UINTMAX_WIDTH__ 64 -// MIPS32BE:#define __UINTPTR_MAX__ 4294967295UL -// MIPS32BE:#define __UINTPTR_TYPE__ long unsigned int +// MIPS32BE:#define __UINTPTR_MAX__ 4294967295U +// MIPS32BE:#define __UINTPTR_TYPE__ unsigned int // MIPS32BE:#define __UINTPTR_WIDTH__ 32 // MIPS32BE:#define __UINT_FAST16_MAX__ 65535 // MIPS32BE:#define __UINT_FAST16_TYPE__ unsigned short @@ -300,10 +300,10 @@ // MIPS32EL:#define __INTMAX_MAX__ 9223372036854775807LL // MIPS32EL:#define __INTMAX_TYPE__ long long int // MIPS32EL:#define __INTMAX_WIDTH__ 64 -// MIPS32EL:#define __INTPTR_FMTd__ "ld" -// MIPS32EL:#define __INTPTR_FMTi__ "li" -// MIPS32EL:#define __INTPTR_MAX__ 2147483647L -// MIPS32EL:#define __INTPTR_TYPE__ long int +// MIPS32EL:#define __INTPTR_FMTd__ "d" +// MIPS32EL:#define __INTPTR_FMTi__ "i" +// MIPS32EL:#define __INTPTR_MAX__ 2147483647 +// MIPS32EL:#define __INTPTR_TYPE__ int // MIPS32EL:#define __INTPTR_WIDTH__ 32 // MIPS32EL:#define __INT_FAST16_FMTd__ "hd" // MIPS32EL:#define __INT_FAST16_FMTi__ "hi" @@ -402,8 +402,8 @@ // MIPS32EL:#define __UINTMAX_MAX__ 18446744073709551615ULL // MIPS32EL:#define __UINTMAX_TYPE__ long long unsigned 
int // MIPS32EL:#define __UINTMAX_WIDTH__ 64 -// MIPS32EL:#define __UINTPTR_MAX__ 4294967295UL -// MIPS32EL:#define __UINTPTR_TYPE__ long unsigned int +// MIPS32EL:#define __UINTPTR_MAX__ 4294967295U +// MIPS32EL:#define __UINTPTR_TYPE__ unsigned int // MIPS32EL:#define __UINTPTR_WIDTH__ 32 // MIPS32EL:#define __UINT_FAST16_MAX__ 65535 // MIPS32EL:#define __UINT_FAST16_TYPE__ unsigned short @@ -547,10 +547,10 @@ // MIPSN32BE: #define __INTMAX_MAX__ 9223372036854775807LL // MIPSN32BE: #define __INTMAX_TYPE__ long long int // MIPSN32BE: #define __INTMAX_WIDTH__ 64 -// MIPSN32BE: #define __INTPTR_FMTd__ "ld" -// MIPSN32BE: #define __INTPTR_FMTi__ "li" -// MIPSN32BE: #define __INTPTR_MAX__ 2147483647L -// MIPSN32BE: #define __INTPTR_TYPE__ long int +// MIPSN32BE: #define __INTPTR_FMTd__ "d" +// MIPSN32BE: #define __INTPTR_FMTi__ "i" +// MIPSN32BE: #define __INTPTR_MAX__ 2147483647 +// MIPSN32BE: #define __INTPTR_TYPE__ int // MIPSN32BE: #define __INTPTR_WIDTH__ 32 // MIPSN32BE: #define __INT_FAST16_FMTd__ "hd" // MIPSN32BE: #define __INT_FAST16_FMTi__ "hi" @@ -684,12 +684,12 @@ // MIPSN32BE: #define __UINTMAX_MAX__ 18446744073709551615ULL // MIPSN32BE: #define __UINTMAX_TYPE__ long long unsigned int // MIPSN32BE: #define __UINTMAX_WIDTH__ 64 -// MIPSN32BE: #define __UINTPTR_FMTX__ "lX" -// MIPSN32BE: #define __UINTPTR_FMTo__ "lo" -// MIPSN32BE: #define __UINTPTR_FMTu__ "lu" -// MIPSN32BE: #define __UINTPTR_FMTx__ "lx" -// MIPSN32BE: #define __UINTPTR_MAX__ 4294967295UL -// MIPSN32BE: #define __UINTPTR_TYPE__ long unsigned int +// MIPSN32BE: #define __UINTPTR_FMTX__ "X" +// MIPSN32BE: #define __UINTPTR_FMTo__ "o" +// MIPSN32BE: #define __UINTPTR_FMTu__ "u" +// MIPSN32BE: #define __UINTPTR_FMTx__ "x" +// MIPSN32BE: #define __UINTPTR_MAX__ 4294967295U +// MIPSN32BE: #define __UINTPTR_TYPE__ unsigned int // MIPSN32BE: #define __UINTPTR_WIDTH__ 32 // MIPSN32BE: #define __UINT_FAST16_FMTX__ "hX" // MIPSN32BE: #define __UINT_FAST16_FMTo__ "ho" @@ -864,10 +864,10 @@ // MIPSN32EL: #define __INTMAX_MAX__ 9223372036854775807LL // MIPSN32EL: #define __INTMAX_TYPE__ long long int // MIPSN32EL: #define __INTMAX_WIDTH__ 64 -// MIPSN32EL: #define __INTPTR_FMTd__ "ld" -// MIPSN32EL: #define __INTPTR_FMTi__ "li" -// MIPSN32EL: #define __INTPTR_MAX__ 2147483647L -// MIPSN32EL: #define __INTPTR_TYPE__ long int +// MIPSN32EL: #define __INTPTR_FMTd__ "d" +// MIPSN32EL: #define __INTPTR_FMTi__ "i" +// MIPSN32EL: #define __INTPTR_MAX__ 2147483647 +// MIPSN32EL: #define __INTPTR_TYPE__ int // MIPSN32EL: #define __INTPTR_WIDTH__ 32 // MIPSN32EL: #define __INT_FAST16_FMTd__ "hd" // MIPSN32EL: #define __INT_FAST16_FMTi__ "hi" @@ -1001,12 +1001,12 @@ // MIPSN32EL: #define __UINTMAX_MAX__ 18446744073709551615ULL // MIPSN32EL: #define __UINTMAX_TYPE__ long long unsigned int // MIPSN32EL: #define __UINTMAX_WIDTH__ 64 -// MIPSN32EL: #define __UINTPTR_FMTX__ "lX" -// MIPSN32EL: #define __UINTPTR_FMTo__ "lo" -// MIPSN32EL: #define __UINTPTR_FMTu__ "lu" -// MIPSN32EL: #define __UINTPTR_FMTx__ "lx" -// MIPSN32EL: #define __UINTPTR_MAX__ 4294967295UL -// MIPSN32EL: #define __UINTPTR_TYPE__ long unsigned int +// MIPSN32EL: #define __UINTPTR_FMTX__ "X" +// MIPSN32EL: #define __UINTPTR_FMTo__ "o" +// MIPSN32EL: #define __UINTPTR_FMTu__ "u" +// MIPSN32EL: #define __UINTPTR_FMTx__ "x" +// MIPSN32EL: #define __UINTPTR_MAX__ 4294967295U +// MIPSN32EL: #define __UINTPTR_TYPE__ unsigned int // MIPSN32EL: #define __UINTPTR_WIDTH__ 32 // MIPSN32EL: #define __UINT_FAST16_FMTX__ "hX" // MIPSN32EL: #define __UINT_FAST16_FMTo__ "ho" diff --git 
a/clang/test/Preprocessor/openacc.c b/clang/test/Preprocessor/openacc.c index be7052f00e0ce..283baa6c2fe4b 100644 --- a/clang/test/Preprocessor/openacc.c +++ b/clang/test/Preprocessor/openacc.c @@ -1,13 +1,9 @@ // RUN: %clang_cc1 -E -fopenacc %s | FileCheck %s --check-prefix=DEFAULT -// RUN: %clang_cc1 -E -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=OVERRIDE -// DEFAULT: OpenACC:1: -// OVERRIDE: OpenACC:202211: +// DEFAULT: OpenACC:202506: OpenACC:_OPENACC: // RUN: %clang_cc1 -E -dM -fopenacc %s | FileCheck %s --check-prefix=MACRO_PRINT_DEF -// RUN: %clang_cc1 -E -dM -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=MACRO_PRINT_OVR -// MACRO_PRINT_DEF: #define _OPENACC 1 -// MACRO_PRINT_OVR: #define _OPENACC 202211 +// MACRO_PRINT_DEF: #define _OPENACC 202506 diff --git a/clang/test/Preprocessor/pragma-pushpop-macro-diag.c b/clang/test/Preprocessor/pragma-pushpop-macro-diag.c new file mode 100644 index 0000000000000..293cb828d832e --- /dev/null +++ b/clang/test/Preprocessor/pragma-pushpop-macro-diag.c @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -fms-extensions %s -fsyntax-only -verify + +#pragma push_macro("") // expected-warning {{'#pragma push_macro' expected a non-empty string}} +#pragma pop_macro("") // expected-warning {{'#pragma pop_macro' expected a non-empty string}} diff --git a/clang/test/Preprocessor/pragma-pushpop-macro.c b/clang/test/Preprocessor/pragma-pushpop-macro.c index 0aee074c55c77..238e3ed5eddb3 100644 --- a/clang/test/Preprocessor/pragma-pushpop-macro.c +++ b/clang/test/Preprocessor/pragma-pushpop-macro.c @@ -56,3 +56,6 @@ int P; // CHECK: int pmy2 = 4 // CHECK: int Q; // CHECK: int P; + +#pragma push_macro("") +#pragma pop_macro("") diff --git a/clang/test/Preprocessor/stdint.c b/clang/test/Preprocessor/stdint.c index 899ff59bf0b6a..9f982a3a94fd6 100644 --- a/clang/test/Preprocessor/stdint.c +++ b/clang/test/Preprocessor/stdint.c @@ -350,8 +350,8 @@ // MIPS:typedef int8_t int_fast8_t; // MIPS:typedef uint8_t uint_fast8_t; // -// MIPS:typedef long int intptr_t; -// MIPS:typedef long unsigned int uintptr_t; +// MIPS:typedef int intptr_t; +// MIPS:typedef unsigned int uintptr_t; // // MIPS:typedef long long int intmax_t; // MIPS:typedef long long unsigned int uintmax_t; @@ -396,9 +396,9 @@ // MIPS:INT_FAST64_MAX_ 9223372036854775807LL // MIPS:UINT_FAST64_MAX_ 18446744073709551615ULL // -// MIPS:INTPTR_MIN_ (-2147483647L -1) -// MIPS:INTPTR_MAX_ 2147483647L -// MIPS:UINTPTR_MAX_ 4294967295UL +// MIPS:INTPTR_MIN_ (-2147483647 -1) +// MIPS:INTPTR_MAX_ 2147483647 +// MIPS:UINTPTR_MAX_ 4294967295U // MIPS:PTRDIFF_MIN_ (-2147483647 -1) // MIPS:PTRDIFF_MAX_ 2147483647 // MIPS:SIZE_MAX_ 4294967295U diff --git a/clang/test/Sema/attr-nonblocking-sema.cpp b/clang/test/Sema/attr-nonblocking-sema.cpp index f13cc783dfc33..c8fb40693eec0 100644 --- a/clang/test/Sema/attr-nonblocking-sema.cpp +++ b/clang/test/Sema/attr-nonblocking-sema.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -fblocks -fcxx-exceptions -verify -Wfunction-effects %s -// RUN: %clang_cc1 -fsyntax-only -fblocks -verify -x c -std=c23 -Wfunction-effects %s +// RUN: %clang_cc1 -fsyntax-only -fblocks -fcxx-exceptions -verify -Wfunction-effects -Wfunction-effect-redeclarations %s +// RUN: %clang_cc1 -fsyntax-only -fblocks -verify -x c -std=c23 -Wfunction-effects -Wfunction-effect-redeclarations %s #if !__has_attribute(nonblocking) #error "the 'nonblocking' attribute is not available" @@ -127,29 +127,35 @@ void type_conversions_2() #endif // --- 
VIRTUAL METHODS --- -// Attributes propagate to overridden methods, so no diagnostics except for conflicts. +// Attributes propagate to overridden methods. // Check this in the syntax tests too. #ifdef __cplusplus struct Base { virtual void f1(); - virtual void nonblocking() noexcept [[clang::nonblocking]]; - virtual void nonallocating() noexcept [[clang::nonallocating]]; + virtual void nonblocking() noexcept [[clang::nonblocking]]; // expected-note {{overridden virtual function is here}} + virtual void nonallocating() noexcept [[clang::nonallocating]]; // expected-note {{overridden virtual function is here}} virtual void f2() [[clang::nonallocating]]; // expected-note {{previous declaration is here}} + virtual void f3() [[clang::nonblocking]]; // expected-note {{overridden virtual function is here}} }; struct Derived : public Base { void f1() [[clang::nonblocking]] override; - void nonblocking() noexcept override; - void nonallocating() noexcept override; + void nonblocking() noexcept override; // expected-warning {{overriding function is missing 'nonblocking' attribute from base declaration}} + void nonallocating() noexcept override; // expected-warning {{overriding function is missing 'nonallocating' attribute from base declaration}} void f2() [[clang::allocating]] override; // expected-warning {{effects conflict when merging declarations; kept 'allocating', discarded 'nonallocating'}} }; + +template +struct TDerived : public Base { + void f3() [[clang::nonblocking(B)]] override; // expected-warning {{attribute 'nonblocking' on overriding function conflicts with base declaration}} +}; #endif // __cplusplus // --- REDECLARATIONS --- void f2(); void f2() [[clang::nonblocking]]; // expected-note {{previous declaration is here}} -void f2(); // expected-warning {{attribute 'nonblocking' on function does not match previous declaration}} +void f2(); // expected-warning {{redeclaration is missing 'nonblocking' attribute from previous declaration}} // Note: we verify that the attribute is actually seen during the constraints tests. 
void f3() [[clang::blocking]]; // expected-note {{previous declaration is here}} diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c index 01057b3f8d083..8548d3be8c44a 100644 --- a/clang/test/Sema/builtins-elementwise-math.c +++ b/clang/test/Sema/builtins-elementwise-math.c @@ -386,6 +386,96 @@ void test_builtin_elementwise_minimum(int i, short s, float f, double d, float4 // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was '_Complex float')}} } +void test_builtin_elementwise_maximumnum(int i, short s, float f, double d, float4 fv, double4 dv, int3 iv, unsigned3 uv, int *p) { + i = __builtin_elementwise_maximumnum(p, d); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int *')}} + + struct Foo foo = __builtin_elementwise_maximumnum(d, d); + // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'double'}} + + i = __builtin_elementwise_maximumnum(i); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} + + i = __builtin_elementwise_maximumnum(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} + + i = __builtin_elementwise_maximumnum(i, i, i); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} + + i = __builtin_elementwise_maximumnum(fv, iv); + // expected-error@-1 {{arguments are of different types ('float4' (vector of 4 'float' values) vs 'int3' (vector of 3 'int' values))}} + + i = __builtin_elementwise_maximumnum(uv, iv); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'unsigned3' (vector of 3 'unsigned int' values))}} + + dv = __builtin_elementwise_maximumnum(fv, dv); + // expected-error@-1 {{arguments are of different types ('float4' (vector of 4 'float' values) vs 'double4' (vector of 4 'double' values))}} + + d = __builtin_elementwise_maximumnum(f, d); + // expected-error@-1 {{arguments are of different types ('float' vs 'double')}} + + fv = __builtin_elementwise_maximumnum(fv, fv); + + i = __builtin_elementwise_maximumnum(iv, iv); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int3' (vector of 3 'int' values))}} + + i = __builtin_elementwise_maximumnum(i, i); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int')}} + + int A[10]; + A = __builtin_elementwise_maximumnum(A, A); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int *')}} + + _Complex float c1, c2; + c1 = __builtin_elementwise_maximumnum(c1, c2); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was '_Complex float')}} +} + +void test_builtin_elementwise_minimumnum(int i, short s, float f, double d, float4 fv, double4 dv, int3 iv, unsigned3 uv, int *p) { + i = __builtin_elementwise_minimumnum(p, d); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int *')}} + + struct Foo foo = __builtin_elementwise_minimumnum(d, d); + // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'double'}} + + i = __builtin_elementwise_minimumnum(i); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} + + i = __builtin_elementwise_minimumnum(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} + + i = 
__builtin_elementwise_minimumnum(i, i, i); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} + + i = __builtin_elementwise_minimumnum(fv, iv); + // expected-error@-1 {{arguments are of different types ('float4' (vector of 4 'float' values) vs 'int3' (vector of 3 'int' values))}} + + i = __builtin_elementwise_minimumnum(uv, iv); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'unsigned3' (vector of 3 'unsigned int' values))}} + + dv = __builtin_elementwise_minimumnum(fv, dv); + // expected-error@-1 {{arguments are of different types ('float4' (vector of 4 'float' values) vs 'double4' (vector of 4 'double' values))}} + + d = __builtin_elementwise_minimumnum(f, d); + // expected-error@-1 {{arguments are of different types ('float' vs 'double')}} + + fv = __builtin_elementwise_minimumnum(fv, fv); + + i = __builtin_elementwise_minimumnum(iv, iv); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int3' (vector of 3 'int' values))}} + + i = __builtin_elementwise_minimumnum(i, i); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int')}} + + int A[10]; + A = __builtin_elementwise_minimumnum(A, A); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was 'int *')}} + + _Complex float c1, c2; + c1 = __builtin_elementwise_minimumnum(c1, c2); + // expected-error@-1 {{1st argument must be a scalar or vector of floating-point types (was '_Complex float')}} +} + void test_builtin_elementwise_bitreverse(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) { struct Foo s = __builtin_elementwise_bitreverse(i); diff --git a/clang/test/Sema/const-eval.c b/clang/test/Sema/const-eval.c index e358aceaad5a4..11cc7fbc0feb3 100644 --- a/clang/test/Sema/const-eval.c +++ b/clang/test/Sema/const-eval.c @@ -32,7 +32,7 @@ void f(void) _Complex float g16 = (1.0f + 1.0fi); // ?: in constant expressions. -int g17[(3?:1) - 2]; +int g17[(3?:1) - 2]; EVAL_EXPR(18, ((int)((void*)10 + 10)) == 20 ? 1 : -1); @@ -150,3 +150,7 @@ struct PR35214_X { int PR35214_x; int PR35214_y = ((struct PR35214_X *)&PR35214_x)->arr[1]; // expected-error {{not a compile-time constant}} int *PR35214_z = &((struct PR35214_X *)&PR35214_x)->arr[1]; // ok, &PR35214_x + 2 + + +int * GH149500_p = &(*(int *)0x400); +static const void *GH149500_q = &(*(const struct sysrq_key_op *)0); diff --git a/clang/test/Sema/diagnose_if.c b/clang/test/Sema/diagnose_if.c index e9b8497d5ca4e..a4cf43e9c869f 100644 --- a/clang/test/Sema/diagnose_if.c +++ b/clang/test/Sema/diagnose_if.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -verify -fno-builtin +// RUN: %clang_cc1 %s -verify -fno-builtin -fexperimental-new-constant-interpreter #define _diagnose_if(...) 
__attribute__((diagnose_if(__VA_ARGS__))) diff --git a/clang/test/Sema/dllexport.c b/clang/test/Sema/dllexport.c index 3f911fb095c0f..5f6ff36e290e9 100644 --- a/clang/test/Sema/dllexport.c +++ b/clang/test/Sema/dllexport.c @@ -2,6 +2,10 @@ // RUN: %clang_cc1 -triple x86_64-win32 -fsyntax-only -fms-extensions -verify -std=c11 %s // RUN: %clang_cc1 -triple i686-mingw32 -fsyntax-only -fms-extensions -verify -std=c11 %s // RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c99 %s +// RUN: %clang_cc1 -triple i686-windows-itanium -fsyntax-only -fms-extensions -verify -std=c99 %s +// RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify -std=c11 %s +// RUN: %clang_cc1 -triple x86_64-sie-ps5 -fsyntax-only -fms-extensions -verify -std=c99 %s +// RUN: %clang_cc1 -triple x86_64-sie-ps5 -fsyntax-only -fms-extensions -verify -std=c11 %s // Invalid usage. __declspec(dllexport) typedef int typedef1; diff --git a/clang/test/Sema/format-strings-fixit-ssize_t.c b/clang/test/Sema/format-strings-fixit-ssize_t.c index 2c83db0b66362..96806517b80f2 100644 --- a/clang/test/Sema/format-strings-fixit-ssize_t.c +++ b/clang/test/Sema/format-strings-fixit-ssize_t.c @@ -11,8 +11,8 @@ int printf(char const *, ...); int scanf(const char *, ...); +typedef long ssize_t; void test(void) { - typedef signed long int ssize_t; printf("%f", (ssize_t) 42); ssize_t s; scanf("%f", &s); diff --git a/clang/test/Sema/format-strings-scanf.c b/clang/test/Sema/format-strings-scanf.c index eb5b8ec36bf7a..d1f694f3595cf 100644 --- a/clang/test/Sema/format-strings-scanf.c +++ b/clang/test/Sema/format-strings-scanf.c @@ -210,13 +210,13 @@ void test_size_types(void) { scanf("%zd", &s); // No warning. double d2 = 0.; - scanf("%zd", &d2); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%zd", &d2); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'double *'}} ssize_t sn = 0; scanf("%zn", &sn); // No warning. 
double d3 = 0.; - scanf("%zn", &d3); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%zn", &d3); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'double *'}} } void test_ptrdiff_t_types(void) { diff --git a/clang/test/Sema/format-strings-size_t.c b/clang/test/Sema/format-strings-size_t.c index 5058a762183d3..19e3ac9e6ecd9 100644 --- a/clang/test/Sema/format-strings-size_t.c +++ b/clang/test/Sema/format-strings-size_t.c @@ -2,10 +2,14 @@ int printf(char const *, ...); +#include <stddef.h> + void test(void) { // size_t + printf("%zu", (size_t)0); // no-warning + printf("%zu", sizeof(int)); // no-warning + printf("%zu", (size_t)0 + sizeof(int)); // no-warning printf("%zu", (double)42); // expected-warning {{format specifies type 'size_t' (aka 'unsigned long') but the argument has type 'double'}} - // intmax_t / uintmax_t printf("%jd", (double)42); // expected-warning {{format specifies type 'intmax_t' (aka 'long') but the argument has type 'double'}} printf("%ju", (double)42); // expected-warning {{format specifies type 'uintmax_t' (aka 'unsigned long') but the argument has type 'double'}} diff --git a/clang/test/Sema/matrix-type-builtins.c b/clang/test/Sema/matrix-type-builtins.c index b92f3ce6a3e8c..77e3b8a4287ed 100644 --- a/clang/test/Sema/matrix-type-builtins.c +++ b/clang/test/Sema/matrix-type-builtins.c @@ -73,13 +73,13 @@ void column_major_load(float *p1, int *p2, _Bool *p3, struct Foo *p4) { 10, // expected-error {{1st argument must be a pointer to a valid matrix element type}} 1ull << 21, // expected-error {{row dimension is outside the allowed range [1, 1048575]}} 1ull << 21, // expected-error {{column dimension is outside the allowed range [1, 1048575]}} - ""); // expected-error {{incompatible pointer to integer conversion casting 'char[1]' to type 'unsigned long'}} + ""); // expected-error {{incompatible pointer to integer conversion casting 'char[1]' to type '__size_t' (aka 'unsigned long')}} sx5x10_t a13 = __builtin_matrix_column_major_load( 10, // expected-error {{1st argument must be a pointer to a valid matrix element type}} - *p4, // expected-error {{casting 'struct Foo' to incompatible type 'unsigned long'}} + *p4, // expected-error {{casting 'struct Foo' to incompatible type '__size_t' (aka 'unsigned long')}} "", // expected-error {{column argument must be a constant unsigned integer expression}} - // expected-error@-1 {{incompatible pointer to integer conversion casting 'char[1]' to type 'unsigned long'}} + // expected-error@-1 {{incompatible pointer to integer conversion casting 'char[1]' to type '__size_t' (aka 'unsigned long')}} 10); } @@ -96,7 +96,7 @@ void column_major_store(sx5x10_t *m1, ix3x2_t *m2, float *p1, int *p2, struct Fo __builtin_matrix_column_major_store( "", // expected-error {{1st argument must be a matrix}} 10, // expected-error {{2nd argument must be a pointer to a valid matrix element type}} - *p3); // expected-error {{casting 'struct Foo' to incompatible type 'unsigned long'}} + *p3); // expected-error {{casting 'struct Foo' to incompatible type '__size_t' (aka 'unsigned long')}} __builtin_matrix_column_major_store( *m1, diff --git a/clang/test/Sema/ptrauth-atomic-ops.c b/clang/test/Sema/ptrauth-atomic-ops.c index ccb9a1abcc14d..8872090d83b8d 100644 --- a/clang/test/Sema/ptrauth-atomic-ops.c +++ b/clang/test/Sema/ptrauth-atomic-ops.c @@ -54,7 +54,7 @@ void f() { __c11_atomic_exchange(ATOMIZE(j),
ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} __c11_atomic_fetch_add(ATOMIZE(non_addr_discriminatedauthenticated_ptr), ATOMIZE(j), memory_order_seq_cst); - // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile _Atomic(typeof (j)) *' to parameter of type 'long'}} + // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile _Atomic(typeof (j)) *' to parameter of type '__ptrdiff_t'}} __c11_atomic_fetch_and(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} diff --git a/clang/test/Sema/ptrauth.c b/clang/test/Sema/ptrauth.c index e3932615c2962..b4e5214a7cb50 100644 --- a/clang/test/Sema/ptrauth.c +++ b/clang/test/Sema/ptrauth.c @@ -57,7 +57,7 @@ void test_string_discriminator(const char *str) { __builtin_ptrauth_string_discriminator(str); // expected-error {{argument must be a string literal}} __builtin_ptrauth_string_discriminator(L"wide test"); // expected-error {{argument must be a string literal}} expected-warning {{incompatible pointer types passing 'int[10]' to parameter of type 'const char *'}} - void *mismatch = __builtin_ptrauth_string_discriminator("test string"); // expected-error {{incompatible integer to pointer conversion initializing 'void *' with an expression of type 'unsigned long'}} + void *mismatch = __builtin_ptrauth_string_discriminator("test string"); // expected-error {{incompatible integer to pointer conversion initializing 'void *' with an expression of type '__size_t'}} } diff --git a/clang/test/Sema/warn-fortify-source.c b/clang/test/Sema/warn-fortify-source.c index f48ea0907c657..216878c0836d8 100644 --- a/clang/test/Sema/warn-fortify-source.c +++ b/clang/test/Sema/warn-fortify-source.c @@ -3,6 +3,11 @@ // RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify // RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_BUILTINS +// RUN: %clang_cc1 -triple x86_64-apple-macosx10.14.0 %s -verify -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_BUILTINS -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_BUILTINS -fexperimental-new-constant-interpreter + typedef unsigned long size_t; #ifdef __cplusplus diff --git a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp index 38dfdb98f08fc..2b934ac23b92d 100644 --- a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp +++ b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -mllvm -debug-only=LifetimeFacts,LifetimeDataflow -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -fexperimental-lifetime-safety -mllvm -debug-only=LifetimeFacts -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s // REQUIRES: asserts struct MyObj { @@ -19,10 +19,6 @@ MyObj* return_local_addr() { // CHECK: ReturnOfOrigin (OriginID: [[O_RET_VAL]]) // CHECK: Expire (LoanID: [[L_X]]) } -// CHECK: Dataflow results: -// CHECK-DAG: Origin 
[[O_ADDR_X]] contains Loan [[L_X]] -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_X]] -// CHECK-DAG: Origin [[O_RET_VAL]] contains Loan [[L_X]] // Pointer Assignment and Return @@ -47,15 +43,6 @@ MyObj* assign_and_return_local_addr() { // CHECK: ReturnOfOrigin (OriginID: [[O_PTR2_RVAL_2]]) // CHECK: Expire (LoanID: [[L_Y]]) } -// CHECK: Dataflow results: -// CHECK-DAG: Origin [[O_ADDR_Y]] contains Loan [[L_Y]] -// CHECK-DAG: Origin [[O_PTR1]] contains Loan [[L_Y]] -// CHECK-DAG: Origin [[O_PTR2]] contains Loan [[L_Y]] -// CHECK-DAG: Origin [[O_PTR1_RVAL]] contains Loan [[L_Y]] -// CHECK-DAG: Origin [[O_PTR1_RVAL_2]] contains Loan [[L_Y]] -// CHECK-DAG: Origin [[O_PTR2_RVAL]] contains Loan [[L_Y]] -// CHECK-DAG: Origin [[O_PTR2_RVAL_2]] contains Loan [[L_Y]] - // Return of Non-Pointer Type // CHECK-LABEL: Function: return_int_val @@ -65,8 +52,6 @@ int return_int_val() { return x; } // CHECK-NEXT: End of Block -// CHECK: Dataflow results: -// CHECK: // Loan Expiration (Automatic Variable, C++) @@ -79,9 +64,6 @@ void loan_expires_cpp() { // CHECK: AssignOrigin (DestID: [[O_POBJ:[0-9]+]], SrcID: [[O_ADDR_OBJ]]) // CHECK: Expire (LoanID: [[L_OBJ]]) } -// CHECK: Dataflow results: -// CHECK-DAG: Origin [[O_ADDR_OBJ]] contains Loan [[L_OBJ]] -// CHECK-DAG: Origin [[O_POBJ]] contains Loan [[L_OBJ]] // FIXME: No expire for Trivial Destructors @@ -96,10 +78,6 @@ void loan_expires_trivial() { // CHECK-NEXT: End of Block // FIXME: Add check for Expire once trivial destructors are handled for expiration. } -// CHECK: Dataflow results: -// CHECK-DAG: Origin [[O_ADDR_TRIVIAL_OBJ]] contains Loan [[L_TRIVIAL_OBJ]] -// CHECK-DAG: Origin [[O_PTOBJ]] contains Loan [[L_TRIVIAL_OBJ]] - // CHECK-LABEL: Function: conditional void conditional(bool condition) { @@ -119,13 +97,6 @@ void conditional(bool condition) { // CHECK: AssignOrigin (DestID: [[O_P_RVAL:[0-9]+]], SrcID: [[O_P]]) // CHECK: AssignOrigin (DestID: [[O_Q:[0-9]+]], SrcID: [[O_P_RVAL]]) } -// CHECK: Dataflow results: -// CHECK-DAG: Origin [[O_ADDR_A]] contains Loan [[L_A]] -// CHECK-DAG: Origin [[O_ADDR_B]] contains Loan [[L_B]] -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_A]] -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_B]] -// CHECK-DAG: Origin [[O_Q]] contains Loan [[L_A]] -// CHECK-DAG: Origin [[O_Q]] contains Loan [[L_B]] // CHECK-LABEL: Function: pointers_in_a_cycle @@ -161,25 +132,6 @@ void pointers_in_a_cycle(bool condition) { // CHECK: AssignOrigin (DestID: [[O_P3]], SrcID: [[O_TEMP_RVAL]]) } } -// At the end of the analysis, the origins for the pointers involved in the cycle -// (p1, p2, p3, temp) should all contain the loans from v1, v2, and v3 at the fixed point. 
-// CHECK: Dataflow results: -// CHECK-DAG: Origin [[O_P1]] contains Loan [[L_V1]] -// CHECK-DAG: Origin [[O_P1]] contains Loan [[L_V2]] -// CHECK-DAG: Origin [[O_P1]] contains Loan [[L_V3]] -// CHECK-DAG: Origin [[O_P2]] contains Loan [[L_V1]] -// CHECK-DAG: Origin [[O_P2]] contains Loan [[L_V2]] -// CHECK-DAG: Origin [[O_P2]] contains Loan [[L_V3]] -// CHECK-DAG: Origin [[O_P3]] contains Loan [[L_V1]] -// CHECK-DAG: Origin [[O_P3]] contains Loan [[L_V2]] -// CHECK-DAG: Origin [[O_P3]] contains Loan [[L_V3]] -// CHECK-DAG: Origin [[O_TEMP]] contains Loan [[L_V1]] -// CHECK-DAG: Origin [[O_TEMP]] contains Loan [[L_V2]] -// CHECK-DAG: Origin [[O_TEMP]] contains Loan [[L_V3]] -// CHECK-DAG: Origin [[O_ADDR_V1]] contains Loan [[L_V1]] -// CHECK-DAG: Origin [[O_ADDR_V2]] contains Loan [[L_V2]] -// CHECK-DAG: Origin [[O_ADDR_V3]] contains Loan [[L_V3]] - // CHECK-LABEL: Function: overwrite_origin void overwrite_origin() { @@ -195,10 +147,6 @@ void overwrite_origin() { // CHECK: Expire (LoanID: [[L_S2]]) // CHECK: Expire (LoanID: [[L_S1]]) } -// CHECK: Dataflow results: -// CHECK: Origin [[O_P]] contains Loan [[L_S2]] -// CHECK-NOT: Origin [[O_P]] contains Loan [[L_S1]] - // CHECK-LABEL: Function: reassign_to_null void reassign_to_null() { @@ -213,8 +161,6 @@ void reassign_to_null() { } // FIXME: Have a better representation for nullptr than just an empty origin. // It should be a separate loan and origin kind. -// CHECK: Dataflow results: -// CHECK: Origin [[O_P]] contains no loans // CHECK-LABEL: Function: reassign_in_if @@ -235,11 +181,6 @@ void reassign_in_if(bool condition) { // CHECK: Expire (LoanID: [[L_S2]]) // CHECK: Expire (LoanID: [[L_S1]]) } -// CHECK: Dataflow results: -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_S1]] -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_S2]] -// CHECK-DAG: Origin [[O_ADDR_S1]] contains Loan [[L_S1]] -// CHECK-DAG: Origin [[O_ADDR_S2]] contains Loan [[L_S2]] // CHECK-LABEL: Function: assign_in_switch @@ -276,14 +217,6 @@ void assign_in_switch(int mode) { // CHECK-DAG: Expire (LoanID: [[L_S2]]) // CHECK-DAG: Expire (LoanID: [[L_S1]]) } -// CHECK: Dataflow results: -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_S1]] -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_S2]] -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_S3]] -// CHECK-DAG: Origin [[O_ADDR_S1]] contains Loan [[L_S1]] -// CHECK-DAG: Origin [[O_ADDR_S2]] contains Loan [[L_S2]] -// CHECK-DAG: Origin [[O_ADDR_S3]] contains Loan [[L_S3]] - // CHECK-LABEL: Function: loan_in_loop void loan_in_loop(bool condition) { @@ -299,10 +232,6 @@ void loan_in_loop(bool condition) { // CHECK: Expire (LoanID: [[L_INNER]]) } } -// CHECK: Dataflow results: -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_INNER]] -// CHECK-DAG: Origin [[O_ADDR_INNER]] contains Loan [[L_INNER]] - // CHECK-LABEL: Function: loop_with_break void loop_with_break(int count) { @@ -326,13 +255,6 @@ void loop_with_break(int count) { // CHECK: Expire (LoanID: [[L_S1]]) } -// CHECK-LABEL: Dataflow results: -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_S1]] -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_S2]] -// CHECK-DAG: Origin [[O_ADDR_S1]] contains Loan [[L_S1]] -// CHECK-DAG: Origin [[O_ADDR_S2]] contains Loan [[L_S2]] - - // CHECK-LABEL: Function: nested_scopes void nested_scopes() { MyObj* p = nullptr; @@ -355,13 +277,6 @@ void nested_scopes() { // CHECK: Expire (LoanID: [[L_OUTER]]) } -// CHECK-LABEL: Dataflow results: -// CHECK-DAG: Origin [[O_P]] contains Loan [[L_INNER]] -// CHECK-DAG: Origin [[O_ADDR_INNER]] contains Loan 
[[L_INNER]] -// CHECK-DAG: Origin [[O_ADDR_OUTER]] contains Loan [[L_OUTER]] -// CHECK-NOT: Origin [[O_P]] contains Loan [[L_OUTER]] - - // CHECK-LABEL: Function: pointer_indirection void pointer_indirection() { int a; diff --git a/clang/test/SemaCXX/attr-target-clones-riscv.cpp b/clang/test/SemaCXX/attr-target-clones-riscv.cpp index 102bb4b9b3d2b..7648284f80c48 100644 --- a/clang/test/SemaCXX/attr-target-clones-riscv.cpp +++ b/clang/test/SemaCXX/attr-target-clones-riscv.cpp @@ -9,6 +9,9 @@ void __attribute__((target_clones("default", "mtune=sifive-u74"))) mtune() {} // expected-warning@+1 {{version list contains duplicate entries}} void __attribute__((target_clones("default", "arch=+c", "arch=+c"))) dupVersion() {} +// expected-warning@+1 {{version list contains duplicate entries}} +void __attribute__((target_clones(" default", "default "))) dupDefault() {} + // expected-warning@+1 {{unsupported '' in the 'target_clones' attribute string; 'target_clones' attribute ignored}} void __attribute__((target_clones("default", ""))) emptyVersion() {} diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp index c390fee1c38d9..5ecb8c607f59a 100644 --- a/clang/test/SemaCXX/constant-expression-cxx11.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp @@ -1413,8 +1413,8 @@ namespace ComplexConstexpr { static_assert(t2p[2] == 0.0, ""); // expected-error {{constant expr}} expected-note {{one-past-the-end pointer}} static_assert(t2p[3] == 0.0, ""); // expected-error {{constant expr}} expected-note {{cannot refer to element 3 of array of 2 elements}} constexpr _Complex float *p = 0; // expected-warning {{'_Complex' is a C99 extension}} - constexpr float pr = __real *p; // expected-error {{constant expr}} expected-note {{cannot access real component of null}} - constexpr float pi = __imag *p; // expected-error {{constant expr}} expected-note {{cannot access imaginary component of null}} + constexpr float pr = __real *p; // expected-error {{constant expr}} expected-note {{dereferencing a null pointer}} + constexpr float pi = __imag *p; // expected-error {{constant expr}} expected-note {{dereferencing a null pointer}} constexpr const _Complex double *q = &test3 + 1; // expected-warning {{'_Complex' is a C99 extension}} constexpr double qr = __real *q; // expected-error {{constant expr}} expected-note {{cannot access real component of pointer past the end}} constexpr double qi = __imag *q; // expected-error {{constant expr}} expected-note {{cannot access imaginary component of pointer past the end}} diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp index e93b98c185a82..1743e0e3ac4b5 100644 --- a/clang/test/SemaCXX/constant-expression-cxx14.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp @@ -265,7 +265,7 @@ namespace const_modify { namespace null { constexpr int test(int *p) { - return *p = 123; // expected-note {{assignment to dereferenced null pointer}} + return *p = 123; // expected-note {{dereferencing a null pointer}} } static_assert(test(0), ""); // expected-error {{constant expression}} expected-note {{in call}} } @@ -1335,4 +1335,118 @@ namespace comparison_dead_variable { } // FIXME: This should fail. 
static_assert(f(),""); + +} +namespace GH48665 { +constexpr bool foo(int *i) { + int &j = *i; + // expected-note@-1 {{dereferencing a null pointer}} + return true; +} + +static_assert(foo(nullptr), ""); // expected-note {{in call to 'foo(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + +constexpr bool foo_rvalue(int *i) { + int &&j = (int&&)*i; + // expected-note@-1 {{dereferencing a null pointer}} + return true; +} +static_assert(foo_rvalue(nullptr), ""); // expected-note {{in call to 'foo_rvalue(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + +int arr[3]; // expected-note {{declared here}} +constexpr bool f() { // cxx14_20-error {{constexpr function never produces a constant expression}} + int &r = arr[3]; // expected-note {{read of dereferenced one-past-the-end pointer}} \ + // cxx14_20-note {{read of dereferenced one-past-the-end pointer}} \ + // expected-warning {{array index 3 is past the end of the array}} + return true; +} +static_assert(f(), ""); // expected-note {{in call to 'f()'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + + +struct Aggregate { + int &r; +}; +constexpr bool test_agg(int *i) { + Aggregate a{*i}; //expected-note {{dereferencing a null pointer}} + return true; +} +static_assert(test_agg(nullptr), ""); // expected-note {{in call to 'test_agg(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + +struct B { + constexpr B(int *p) : r{*p} {} // expected-note {{dereferencing a null pointer}} + int &r; +}; + +constexpr bool test_ctr(int *i) { + B b(i); // expected-note {{in call to 'B(nullptr)'}} + return true; +} + +static_assert(test_ctr(nullptr), ""); // expected-note {{in call to 'test_ctr(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + + +// verify that we can dereference function pointers +namespace functions { + +constexpr int f() {return 0;} +constexpr int(*f_ptr)() = &f; +constexpr int(*null_ptr)() = nullptr; + +constexpr int(&f_ref)() = f; +constexpr int test = (*f_ptr)(); +constexpr int test2 = (*f_ref)(); +constexpr int test3 = (*f_ref)(); +constexpr int test4 = (*null_ptr)(); +//expected-error@-1 {{constexpr variable 'test4' must be initialized by a constant expression}} \ +//expected-note@-1 {{'(*null_ptr)' evaluates to a null function pointer}} + +constexpr int(*f_ptr_arr[1])() = {&f}; +constexpr int test_array_ok = (f_ptr_arr[0])(); +constexpr int test_array_err = (f_ptr_arr[1])(); +// expected-error@-1 {{constexpr variable 'test_array_err' must be initialized by a constant expression}} \ +// expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} + +struct S { + int(*f_ptr)() = &f; + int(*f_ptr_arr[1])() = {&f}; + int(&f_ref)() = f; + int(*null_ptr)() = nullptr; +}; + +constexpr int test_member() { + S s {}; + (*s.f_ptr)(); + (*s.f_ref)(); + (s.f_ref)(); + (s.f_ptr_arr[0])(); + (s.f_ptr_arr[1])(); + // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} + return 0; +} +constexpr int test_member_null() { // cxx14_20-error {{never produces a constant expression}} + S s {}; + (*s.null_ptr)(); // expected-note {{'(*s.null_ptr)' evaluates to a null function pointer}} \ + // cxx14_20-note {{'(*s.null_ptr)' evaluates to a null function pointer}} + return 0; +} + 
+static_assert(test_member(), ""); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} \ +// expected-note@-1 {{in call to 'test_member()'}} + +static_assert(test_member_null(), ""); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} \ +// expected-note@-1 {{in call to 'test_member_null()'}} + +} +} + +namespace GH149500 { + unsigned int * p = &(*(unsigned int *)0x400); + static const void *q = &(*(const struct sysrq_key_op *)0); } diff --git a/clang/test/SemaCXX/constant-expression-cxx2a.cpp b/clang/test/SemaCXX/constant-expression-cxx2a.cpp index 85720606fe9de..ffb7e633c2919 100644 --- a/clang/test/SemaCXX/constant-expression-cxx2a.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx2a.cpp @@ -927,7 +927,7 @@ namespace dynamic_alloc { constexpr void use_after_free() { // expected-error {{never produces a constant expression}} int *p = new int; delete p; - *p = 1; // expected-note {{assignment to heap allocated object that has been deleted}} + *p = 1; // expected-note {{read of heap allocated object that has been deleted}} } constexpr void use_after_free_2() { // expected-error {{never produces a constant expression}} struct X { constexpr void f() {} }; diff --git a/clang/test/SemaCXX/constant-expression-p2280r4.cpp b/clang/test/SemaCXX/constant-expression-p2280r4.cpp index 03fea91169787..16f5f823d26c1 100644 --- a/clang/test/SemaCXX/constant-expression-p2280r4.cpp +++ b/clang/test/SemaCXX/constant-expression-p2280r4.cpp @@ -357,3 +357,29 @@ namespace pointer_comparisons { static_assert(!f4()); // expected-error {{static assertion expression is not an integral constant expression}} \ // expected-note {{in call to 'f4()'}} } + +namespace GH149188 { +namespace enable_if_1 { + template <__SIZE_TYPE__ N> + constexpr void foo(const char (&Str)[N]) + __attribute((enable_if(__builtin_strlen(Str), ""))) {} + + void x() { + foo("1234"); + } +} + +namespace enable_if_2 { + constexpr const char (&f())[]; + extern const char (&Str)[]; + constexpr int foo() + __attribute((enable_if(__builtin_strlen(Str), ""))) + {return __builtin_strlen(Str);} + + constexpr const char (&f())[] {return "a";} + constexpr const char (&Str)[] = f(); + void x() { + constexpr int x = foo(); + } +} +} diff --git a/clang/test/SemaCXX/constexpr-backtrace-limit.cpp b/clang/test/SemaCXX/constexpr-backtrace-limit.cpp index e867afdff5c3c..f0c1206a4b8d3 100644 --- a/clang/test/SemaCXX/constexpr-backtrace-limit.cpp +++ b/clang/test/SemaCXX/constexpr-backtrace-limit.cpp @@ -15,14 +15,14 @@ // RUN: not %clang_cc1 -std=c++11 -fsyntax-only %s -fconstexpr-backtrace-limit=2 -fconstexpr-depth=8 -fno-caret-diagnostics 2>&1 | FileCheck %s -check-prefix=TEST3 // TEST3: constant expression -// TEST3-NEXT: reinterpret_cast +// TEST3-NEXT: dereferencing a null pointer // TEST3-NEXT: in call to 'recurse(0)' // TEST3-NEXT: skipping 4 calls // TEST3-NEXT: in call to 'recurse(5)' // RUN: not %clang_cc1 -std=c++11 -fsyntax-only %s -fconstexpr-backtrace-limit=8 -fconstexpr-depth=8 -fno-caret-diagnostics 2>&1 | FileCheck %s -check-prefix=TEST4 // TEST4: constant expression -// TEST4-NEXT: reinterpret_cast +// TEST4-NEXT: dereferencing a null pointer // TEST4-NEXT: in call to 'recurse(0)' // TEST4-NEXT: in call to 'recurse(1)' // TEST4-NEXT: in call to 'recurse(2)' diff --git a/clang/test/SemaCXX/constexpr-never-constant.cpp b/clang/test/SemaCXX/constexpr-never-constant.cpp index 307810ee263dd..5756bb647ce88 100644 --- a/clang/test/SemaCXX/constexpr-never-constant.cpp +++ 
b/clang/test/SemaCXX/constexpr-never-constant.cpp @@ -24,3 +24,10 @@ constexpr void other_func() { throw 12; } + +namespace GH149041 { + // Make sure these don't trigger the diagnostic. + extern const bool& b; + constexpr bool fun1() { return b; } + constexpr bool fun2(const bool& b) { return b; } +} diff --git a/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp b/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp index 6f6f9b04aa392..4cf0e9ffe1d64 100644 --- a/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp +++ b/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++2c -verify %s +// RUN: %clang_cc1 -std=c++2c -verify %s -fexperimental-new-constant-interpreter namespace std { diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp index 6f4003f525930..c6919447798da 100644 --- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp +++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp @@ -372,7 +372,7 @@ void test__builtin_trivially_relocate() { __builtin_trivially_relocate((S*)0, 0, 0); //expected-error {{argument to '__builtin_trivially_relocate' must be relocatable}} __builtin_trivially_relocate((int*)0, 0, 0); //expected-error {{first and second arguments to '__builtin_trivially_relocate' must be of the same type}} - __builtin_trivially_relocate((int*)0, (int*)0, (int*)0); // expected-error-re {{cannot initialize a value of type '{{.*}}' with an rvalue of type 'int *'}} + __builtin_trivially_relocate((int*)0, (int*)0, (int*)0); // expected-error-re {{cannot initialize a value of type '__size_t' (aka '{{.*}}') with an rvalue of type 'int *'}} __builtin_trivially_relocate((int*)0, (int*)0, 0); __builtin_trivially_relocate((R*)0, (R*)0, 0); } diff --git a/clang/test/SemaCXX/diagnose_if-ext.cpp b/clang/test/SemaCXX/diagnose_if-ext.cpp index d5625b501322e..e0f73976eea3a 100644 --- a/clang/test/SemaCXX/diagnose_if-ext.cpp +++ b/clang/test/SemaCXX/diagnose_if-ext.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -Wpedantic -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wpedantic -fsyntax-only %s -verify -fexperimental-new-constant-interpreter void foo() __attribute__((diagnose_if(1, "", "error"))); // expected-warning{{'diagnose_if' is a clang extension}} void foo(int a) __attribute__((diagnose_if(a, "", "error"))); // expected-warning{{'diagnose_if' is a clang extension}} diff --git a/clang/test/SemaCXX/diagnose_if.cpp b/clang/test/SemaCXX/diagnose_if.cpp index 21897c5184b73..1b9e660c4e224 100644 --- a/clang/test/SemaCXX/diagnose_if.cpp +++ b/clang/test/SemaCXX/diagnose_if.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14 +// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14 -fexperimental-new-constant-interpreter #define _diagnose_if(...) 
__attribute__((diagnose_if(__VA_ARGS__))) diff --git a/clang/test/SemaCXX/enum-scoped.cpp b/clang/test/SemaCXX/enum-scoped.cpp index 0ce47274979d9..2d7b3c9557ebd 100644 --- a/clang/test/SemaCXX/enum-scoped.cpp +++ b/clang/test/SemaCXX/enum-scoped.cpp @@ -35,7 +35,7 @@ int a1[Val2]; int a2[E1::Val1]; #if __cplusplus >= 201703L -// expected-error@-3 {{type 'E1' is not implicitly convertible to 'unsigned long'}} +// expected-error@-3 {{type 'E1' is not implicitly convertible to '__size_t' (aka 'unsigned long')}} #else // expected-error@-5 {{size of array has non-integer type}} #endif @@ -44,7 +44,7 @@ int* p1 = new int[Val2]; int* p2 = new int[E1::Val1]; #if __cplusplus >= 201703L -// expected-error@-3 {{converting 'E1' to incompatible type 'unsigned long'}} +// expected-error@-3 {{converting 'E1' to incompatible type '__size_t'}} #else // expected-error@-5 {{array size expression must have integral or unscoped enumeration type, not 'E1'}} #endif diff --git a/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp b/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp index 0b76fdd92dabd..91c4ffda9d818 100644 --- a/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp +++ b/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp @@ -22,7 +22,7 @@ void test_non_last_argument(int i, int j, ...) { va_list ap; __va_start(&ap, &i, 4); // expected-error@-1{{passing 'int *' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int *' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} + // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} } void test_stack_allocated(int i, ...) { @@ -30,13 +30,13 @@ void test_stack_allocated(int i, ...) { int j; __va_start(&ap, &j, 4); // expected-error@-1{{passing 'int *' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int *' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} + // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} } void test_non_pointer_addressof(int i, ...) 
{ va_list ap; __va_start(&ap, 1, 4); // expected-error@-1{{passing 'int' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} + // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} } diff --git a/clang/test/SemaCXX/new-delete.cpp b/clang/test/SemaCXX/new-delete.cpp index f918501554f80..c05130bb30729 100644 --- a/clang/test/SemaCXX/new-delete.cpp +++ b/clang/test/SemaCXX/new-delete.cpp @@ -109,7 +109,7 @@ void bad_news(int *ip) #elif __cplusplus <= 201103L // expected-error@-4 {{array size expression must have integral or unscoped enumeration type, not 'double'}} #else - // expected-warning@-6 {{implicit conversion from 'double' to 'unsigned int' changes value from 1.1 to 1}} + // expected-warning@-6 {{implicit conversion from 'double' to '__size_t' (aka 'unsigned int') changes value from 1.1 to 1}} #endif (void)new int[1][i]; // expected-note {{read of non-const variable 'i' is not allowed in a constant expression}} diff --git a/clang/test/SemaCXX/static-assert-cxx26.cpp b/clang/test/SemaCXX/static-assert-cxx26.cpp index b53c67ee67932..b2ebd2abb785e 100644 --- a/clang/test/SemaCXX/static-assert-cxx26.cpp +++ b/clang/test/SemaCXX/static-assert-cxx26.cpp @@ -19,7 +19,7 @@ struct InvalidSize { const char* data() const; }; static_assert(true, InvalidSize{}); // expected-error {{the message in a static assertion must have a 'size()' member function returning an object convertible to 'std::size_t'}} \ - // expected-error {{value of type 'const char *' is not implicitly convertible to 'unsigned long'}} + // expected-error {{value of type 'const char *' is not implicitly convertible to '__size_t' (aka 'unsigned long')}} struct InvalidData { unsigned long size() const; unsigned long data() const; @@ -371,13 +371,13 @@ struct E { static_assert(true, A{}); // expected-error {{the message in this static assertion is not a constant expression}} // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} +static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} +static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}} +static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, 
which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} static_assert(true, E{}); // expected-error {{the message in this static assertion is not a constant expression}} @@ -391,21 +391,21 @@ static_assert( static_assert( false, // expected-error {{static assertion failed}} - B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} + B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); static_assert( false, // expected-error {{static assertion failed}} - C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} + C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); static_assert( false, // expected-error {{static assertion failed}} - D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}} + D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); diff --git a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp index 87dc58861ee81..281ef5fa63d6f 100644 --- a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp +++ b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp @@ -75,7 +75,7 @@ template void *operator new(std::type_identity, U); template void operator delete(std::type_identity, U, size_t, std::align_val_t); // expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 2nd parameter; use 'void *' instead}} template void operator delete(std::type_identity, void *, U, std::align_val_t); -// expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 3rd parameter; use 'unsigned long' instead}} +// expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 3rd parameter; use '__size_t' (aka 'unsigned long') instead}} template void operator delete(std::type_identity, void *, size_t, U); // expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 4th parameter; use 'std::align_val_t' instead}} template void *operator new(std::type_identity, typename S::size_ty, std::align_val_t); diff --git a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp index 45fdec606ad1b..56c564f170271 100644 --- 
a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp +++ b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp @@ -65,12 +65,12 @@ void testOveraligned() { #ifdef NO_ERRORS // expected-no-diagnostics #else -// expected-error-re@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-16 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-17 {{if you supply your own aligned allocation functions}} // expected-error-re@-18 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-19 {{if you supply your own aligned allocation functions}} -// expected-error-re@-20 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-20 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-21 {{if you supply your own aligned allocation functions}} // expected-error-re@-22 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-23 {{if you supply your own aligned allocation functions}} @@ -83,12 +83,12 @@ void testOveraligned() { // expected-error-re@-28 {{aligned deallocation function of type 'void (void *, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}} // expected-note@-29 {{if you supply your own aligned allocation functions}} -// expected-error-re@-29 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-29 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-30 {{if you supply your own aligned allocation functions}} // expected-error-re@-31 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-32 {{if you supply your own aligned allocation functions}} -// expected-error-re@-33 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-33 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-34 {{if you supply your own aligned allocation functions}} // expected-error-re@-35 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-36 {{if you supply your own aligned allocation functions}} @@ -111,19 +111,19 @@ void testOveralignedCheckOS() { // expected-no-diagnostics #else #if defined(IOS) -// expected-error@-7 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on iOS 11 or newer}} +// expected-error@-7 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on iOS 11 or newer}} // expected-error@-8 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on iOS 11 or newer}}} #elif defined(TVOS) -// expected-error@-10 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on tvOS 11 or newer}}} +// expected-error@-10 {{aligned 
allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on tvOS 11 or newer}}} // expected-error@-11 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on tvOS 11 or newer}}} #elif defined(WATCHOS) -// expected-error@-13 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on watchOS 4 or newer}}} +// expected-error@-13 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on watchOS 4 or newer}}} // expected-error@-14 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on watchOS 4 or newer}}} #elif defined(MACOS) -// expected-error@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on macOS 10.13 or newer}}} +// expected-error@-16 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on macOS 10.13 or newer}}} // expected-error@-17 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on macOS 10.13 or newer}}} #elif defined(ZOS) -// expected-error@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is not available on z/OS}}} +// expected-error@-19 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is not available on z/OS}}} // expected-error@-20 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is not available on z/OS}}} #endif @@ -181,19 +181,19 @@ void testExplicitOperatorNewDeleteOveraligned() { #ifdef NO_ERRORS // expected-no-diagnostics #else -// expected-error-re@-11 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-11 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-12 {{if you supply your own aligned allocation functions}} // expected-error-re@-13 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-14 {{if you supply your own aligned allocation functions}} -// expected-error-re@-15 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-15 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-16 {{if you supply your own aligned allocation functions}} // expected-error-re@-17 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-18 {{if you supply your own aligned allocation functions}} -// expected-error-re@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-19 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-20 {{if you supply your own aligned allocation functions}} // expected-error-re@-21 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} diff --git a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl new file mode 100644 
index 0000000000000..6cb3e56c20f0e --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl @@ -0,0 +1,66 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify + +float test_no_second_arg(float3 p0) { + return refract(p0); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}} +} + +float test_no_third_arg(float3 p0) { + return refract(p0, p0); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}} +} + +float test_too_many_arg(float2 p0) { + return refract(p0, p0, p0, p0); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}} +} + +float test_double_inputs(double p0, double p1, double p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} +} + +float test_int_inputs(int p0, int p1, int p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} +} + +float1 test_vec1_inputs(float1 p0, float1 p1, float1 p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with T = float1]: no type named 'Type' in 'hlsl::__detail::enable_if>'}} + // expected-note@hlsl/hlsl_intrinsics.h:* 
{{candidate template ignored: substitution failure [with T = float1]: no type named 'Type' in 'hlsl::__detail::enable_if>'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 1]: no type named 'Type' in 'hlsl::__detail::enable_if'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 1]: no type named 'Type' in 'hlsl::__detail::enable_if'}} +} + +typedef float float5 __attribute__((ext_vector_type(5))); + +float5 test_vec5_inputs(float5 p0, float5 p1, float p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: deduced conflicting types for parameter 'T' ('float5' (vector of 5 'float' values) vs. 'float')}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: deduced conflicting types for parameter 'T' ('float5' (vector of 5 'float' values) vs. 'float')}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 5]: no type named 'Type' in 'hlsl::__detail::enable_if'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 5]: no type named 'Type' in 'hlsl::__detail::enable_if'}} +} diff --git a/clang/test/SemaHLSL/Language/AssignArray.hlsl b/clang/test/SemaHLSL/Language/AssignArray.hlsl index 1f813e7a350b1..16b60fe40f806 100644 --- a/clang/test/SemaHLSL/Language/AssignArray.hlsl +++ b/clang/test/SemaHLSL/Language/AssignArray.hlsl @@ -13,7 +13,7 @@ export void fn(int8 A) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' // CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector[2]' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector[2]' lvalue Var {{.*}} 'a' 'int8':'vector[2]' -// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} '__size_t':'unsigned long' int8 b = a; // CHECK-LABEL: VarDecl {{.*}} c 'int8':'vector[2]' cinit @@ -25,7 +25,7 @@ export void fn(int8 A) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' // CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector[2]' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'vector[2]' lvalue ParmVar {{.*}} 'A' 'vector[2]' -// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} '__size_t':'unsigned long' int8 c = A; } diff --git a/clang/test/SemaHLSL/Language/InitListAST.hlsl b/clang/test/SemaHLSL/Language/InitListAST.hlsl index 78bf269769ae6..460ec38bb44af 100644 --- a/clang/test/SemaHLSL/Language/InitListAST.hlsl +++ b/clang/test/SemaHLSL/Language/InitListAST.hlsl @@ -97,12 +97,12 @@ TwoFloats case3(int Val) { // CHECK-NEXT: ImplicitCastExpr {{.*}}'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 TwoFloats case4(int2 TwoVals) { TwoFloats TF4 = {TwoVals}; return TF4; @@ -115,11 +115,11 @@ TwoFloats 
case4(int2 TwoVals) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 TwoInts case5(int2 TwoVals) { TwoInts TI1 = {TwoVals}; return TI1; @@ -209,22 +209,22 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} @@ -240,32 +240,32 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 
'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -273,32 +273,32 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 
'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 AnimalBits case8(Doggo D1) { AnimalBits A1 = {D1}; return A1; @@ -317,22 +317,22 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -347,32 +347,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: 
ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -380,32 +380,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 
3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Doggo' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -413,25 +413,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -446,43 +446,43 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar 
{{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh[4]' // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' @@ -490,22 +490,22 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr 
{{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -520,32 +520,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -553,32 +553,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 
'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -586,25 +586,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -619,65 +619,65 @@ AnimalBits case8(Doggo D1) { // 
CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 
'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -692,32 +692,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' 
lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -725,32 +725,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -758,25 +758,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr 
{{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -791,43 +791,43 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue 
.RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 Zoo case9(Doggo D1, AnimalBits A1) { Zoo Z1 = {D1, A1, D1, A1, D1, A1}; return Z1; @@ -867,28 +867,28 @@ FourFloats case10(TwoFloats TF1, TwoFloats TF2) { // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 FourFloats case11(float F) { FourFloats FF1 = {F.xxxx}; return FF1; @@ -1008,52 +1008,52 @@ FourFloats case16() { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: 
ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 float case17() { IntAndFloat Structs[] = {1,2,3,4}; float Floats[] = {Structs, Structs}; diff --git a/clang/test/SemaObjC/matrix-type-builtins.m b/clang/test/SemaObjC/matrix-type-builtins.m index 21b8bf864271d..3916017cf0fe0 100644 --- a/clang/test/SemaObjC/matrix-type-builtins.m +++ b/clang/test/SemaObjC/matrix-type-builtins.m @@ -27,5 +27,5 @@ void test_element_type_mismatch(u4x4 m, MatrixValue *mv) { __builtin_matrix_column_major_store(mv.value, 
mv.value, mv.value);
   // expected-error@-1 {{2nd argument must be a pointer to a valid matrix element type}}
-  // expected-error@-2 {{casting 'double4x4' (aka 'double __attribute__((matrix_type(4, 4)))') to incompatible type 'unsigned long}}
+  // expected-error@-2 {{casting 'double4x4' (aka 'double __attribute__((matrix_type(4, 4)))') to incompatible type '__size_t' (aka 'unsigned long')}}
 }
diff --git a/clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp b/clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp
new file mode 100644
index 0000000000000..e0aee123fe754
--- /dev/null
+++ b/clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct ImplicitCtorDtor{};
+
+struct ImplDeletedCtor{
+  ImplDeletedCtor(int i);
+};
+
+struct DefaultedCtor {
+  DefaultedCtor() = default;
+};
+
+struct ImpledCtor {
+  ImpledCtor() = default;
+};
+
+
+struct DeletedCtor {
+  DeletedCtor() = delete;
+};
+
+struct ImpledDtor {
+  ~ImpledDtor();
+};
+
+struct DefaultedDtor {
+  ~DefaultedDtor() = default;
+};
+
+struct DeletedDtor {
+  ~DeletedDtor() = delete;
+};
+
+struct ImplicitDelDtor {
+  DeletedDtor d;
+};
+
+void private_uses(ImplicitCtorDtor &CDT, ImplDeletedCtor &IDC,
+                  DefaultedCtor &DefC, ImpledCtor &IC, DeletedCtor &DelC,
+                  ImpledDtor &ID, DefaultedDtor &DefD, DeletedDtor &DelD,
+                  ImplicitDelDtor &IDD) {
+
+#pragma acc parallel private(CDT)
+  ;
+
+  // expected-error@+1{{variable of type 'ImplDeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}}
+#pragma acc parallel private(IDC)
+  ;
+
+#pragma acc parallel private(DefC)
+  ;
+
+#pragma acc parallel private(IC)
+  ;
+
+  // expected-error@+1{{variable of type 'DeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}}
+#pragma acc parallel private(DelC)
+  ;
+
+#pragma acc parallel private(ID)
+  ;
+
+#pragma acc parallel private(DefD)
+  ;
+
+  // expected-error@+1{{variable of type 'DeletedDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}}
+#pragma acc parallel private(DelD)
+  ;
+
+  // expected-error@+1{{variable of type 'ImplicitDelDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}}
+#pragma acc parallel private(IDD)
+  ;
+
+}
+
+template <typename T>
+void private_templ(T& t) {
+#pragma acc parallel private(t) // #PRIV
+  ;
+}
+
+void inst(ImplicitCtorDtor &CDT, ImplDeletedCtor &IDC,
+          DefaultedCtor &DefC, ImpledCtor &IC, DeletedCtor &DelC,
+          ImpledDtor &ID, DefaultedDtor &DefD, DeletedDtor &DelD,
+          ImplicitDelDtor &IDD) {
+  private_templ(CDT);
+  // expected-error@#PRIV{{variable of type 'ImplDeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}}
+  // expected-note@+1{{in instantiation}}
+  private_templ(IDC);
+  private_templ(DefC);
+  private_templ(IC);
+  // expected-error@#PRIV{{variable of type 'DeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}}
+  // expected-note@+1{{in instantiation}}
+  private_templ(DelC);
+  private_templ(ID);
+  private_templ(DefD);
+  // expected-error@#PRIV{{variable of type 'DeletedDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}}
+  // expected-note@+1{{in instantiation}}
+  private_templ(DelD);
+  // expected-error@#PRIV{{variable of type 'ImplicitDelDtor'
referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}} + // expected-note@+1{{in instantiation}} + private_templ(IDD); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl index 55d705e6ad238..8aa7c34672783 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl @@ -114,6 +114,13 @@ void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c, int *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}} } +void test_amdgcn_wmma_f32_16x16x128_f8f6f4(global v8f* out, v16i a, v16i b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(mod, a, 2, b, 0, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, mod, b, 0, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4(1, a, 2, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_f8f6f4' must be a constant integer}} +} + void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c, int mod) { *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl index 5915393ae7f56..8fbffbeea0531 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl @@ -8,3 +8,10 @@ void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local vo __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}} __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 3, offset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}} } + +void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int size, int vindex, int voffset, int soffset, int x) { + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, x, vindex, voffset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 3, vindex, voffset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl index 74944f2d93c72..cb832b9aa4845 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -verify -o - %s // REQUIRES: amdgpu-registered-target -void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int offset, int soffset, int x) { +void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int vindex, int offset, int soffset) { __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}} } diff --git a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl index a44d9dd86b86a..22569fa7b443c 100644 --- a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl @@ -87,7 +87,7 @@ kernel void enqueue_kernel_tests(void) { }, 1024, 4294967296L); #ifdef B32 -// expected-warning@-2{{implicit conversion from 'long' to 'unsigned int' changes value from 4294967296 to 0}} +// expected-warning@-2{{implicit conversion from 'long' to '__size_t' (aka 'unsigned int') changes value from 4294967296 to 0}} #endif char c; @@ -97,7 +97,7 @@ kernel void enqueue_kernel_tests(void) { }, c, 1024L); #ifdef WCONV -// expected-warning-re@-2{{implicit conversion changes signedness: 'char' to 'unsigned {{int|long}}'}} +// expected-warning-re@-2{{implicit conversion changes signedness: 'char' to '__size_t' (aka 'unsigned {{int|long}}')}} #endif #define UINT_MAX 4294967295 @@ -107,7 +107,7 @@ kernel void enqueue_kernel_tests(void) { }, sizeof(int), sizeof(int) * UINT_MAX); #ifdef B32 -// expected-warning@-2{{implicit conversion from 'long' to 'unsigned int' changes value from 17179869180 to 4294967292}} +// expected-warning@-2{{implicit conversion from 'long' to '__size_t' (aka 'unsigned int') changes value from 17179869180 to 4294967292}} #endif typedef void (^bl_A_t)(local void *); diff --git a/clang/test/SemaSPIRV/BuiltIns/refract-errors.c b/clang/test/SemaSPIRV/BuiltIns/refract-errors.c new file mode 100644 index 0000000000000..07486c2a60cbf --- /dev/null +++ b/clang/test/SemaSPIRV/BuiltIns/refract-errors.c @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 %s -triple spirv-pc-vulkan-compute -verify + +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef _Float16 half; +typedef half half2 __attribute__((ext_vector_type(2))); + +float2 test_no_third_arg(float2 p0) { + return __builtin_spirv_refract(p0, p0); + // expected-error@-1 {{too few arguments to function call, expected 3, have 2}} +} + +float2 test_too_many_arg(float2 p0, float p1) { + return __builtin_spirv_refract(p0, p0, p1, p1); + // expected-error@-1 {{too many arguments to function call, expected 3, have 4}} +} + +float test_double_scalar_inputs(double p0, double p1, double p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double')}} +} + +float test_int_scalar_inputs(int p0, int p1, int p2) { + return 
__builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'int')}} +} + +float test_float_and_half_inputs(float2 p0, half2 p1, float p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{first two arguments to '__builtin_spirv_refract' must have the same type}} +} + +float test_float_and_half_2_inputs(float2 p0, float2 p1, half p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{all arguments to '__builtin_spirv_refract' must be of scalar or vector type with matching scalar element type: 'float2' (vector of 2 'float' values) vs 'half' (aka '_Float16')}} +} + +float2 test_mismatch_vector_size_inputs(float2 p0, float3 p1, float p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{first two arguments to '__builtin_spirv_refract' must have the same type}} +} diff --git a/clang/test/SemaTemplate/concepts-using-decl.cpp b/clang/test/SemaTemplate/concepts-using-decl.cpp index fca69dea5c88f..41f7b6d2f8faa 100644 --- a/clang/test/SemaTemplate/concepts-using-decl.cpp +++ b/clang/test/SemaTemplate/concepts-using-decl.cpp @@ -176,3 +176,24 @@ void func() { f.foo<10, 10>(); // expected-error {{no matching member function for call to 'foo'}} } } // namespace heads_without_concepts. + +namespace GH146614 { + +template +struct base { + template + void foo(A x) + requires (requires{x;}) + {} +}; + + +struct child : base { + using base::foo; + template + void foo(A x) + requires (false) + {} +}; + +} diff --git a/clang/test/SemaTemplate/type_pack_element.cpp b/clang/test/SemaTemplate/type_pack_element.cpp index 264b4dcdc044d..5ff010c7db29c 100644 --- a/clang/test/SemaTemplate/type_pack_element.cpp +++ b/clang/test/SemaTemplate/type_pack_element.cpp @@ -7,9 +7,9 @@ using test1 = __type_pack_element<0, int>; // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr '0' -// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | |-value: Int 0 -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 0 // CHECK-NEXT: |-TemplateArgument type 'int' // CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' @@ -23,7 +23,7 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr 'N' -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int' // CHECK-NEXT: `-TemplateArgument type 'Ts...' // CHECK-NEXT: `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'Ts...' 
dependent @@ -37,9 +37,9 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr '0' -// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | |-value: Int 0 -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 0 // CHECK-NEXT: `-TemplateArgument type 'Ts...' // CHECK-NEXT: `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'Ts...' dependent @@ -53,7 +53,7 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr 'N' -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int' // CHECK-NEXT: `-TemplateArgument type 'int' // CHECK-NEXT: `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 9089984fa4a54..9412d9735ef82 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -1438,10 +1438,6 @@ bool CursorVisitor::VisitNestedNameSpecifier(NestedNameSpecifier *NNS, return Visit( MakeCursorNamespaceRef(NNS->getAsNamespace(), Range.getBegin(), TU)); - case NestedNameSpecifier::NamespaceAlias: - return Visit(MakeCursorNamespaceRef(NNS->getAsNamespaceAlias(), - Range.getBegin(), TU)); - case NestedNameSpecifier::TypeSpec: { // If the type has a form where we know that the beginning of the source // range matches up with a reference cursor. Visit the appropriate reference @@ -1483,13 +1479,6 @@ bool CursorVisitor::VisitNestedNameSpecifierLoc( break; - case NestedNameSpecifier::NamespaceAlias: - if (Visit(MakeCursorNamespaceRef(NNS->getAsNamespaceAlias(), - Q.getLocalBeginLoc(), TU))) - return true; - - break; - case NestedNameSpecifier::TypeSpec: if (Visit(Q.getTypeLoc())) return true; @@ -1683,6 +1672,10 @@ bool CursorVisitor::VisitTypedefTypeLoc(TypedefTypeLoc TL) { return Visit(MakeCursorTypeRef(TL.getTypedefNameDecl(), TL.getNameLoc(), TU)); } +bool CursorVisitor::VisitPredefinedSugarTypeLoc(PredefinedSugarTypeLoc TL) { + return false; +} + bool CursorVisitor::VisitUnresolvedUsingTypeLoc(UnresolvedUsingTypeLoc TL) { return Visit(MakeCursorTypeRef(TL.getDecl(), TL.getNameLoc(), TU)); } diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt index b6662b66206b2..2b1e266f07392 100644 --- a/clang/tools/libclang/CMakeLists.txt +++ b/clang/tools/libclang/CMakeLists.txt @@ -42,6 +42,7 @@ set(SOURCES Indexing.cpp FatalErrorHandler.cpp Rewrite.cpp + Obsolete.cpp ADDITIONAL_HEADERS CIndexDiagnostic.h diff --git a/clang/tools/libclang/Obsolete.cpp b/clang/tools/libclang/Obsolete.cpp new file mode 100644 index 0000000000000..3596f76e1be6f --- /dev/null +++ b/clang/tools/libclang/Obsolete.cpp @@ -0,0 +1,48 @@ +//===- Obsolete.cpp - Obsolete libclang functions and types -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--------------------------------------------------------------------===// +// +// This file contains libclang symbols whose underlying functionality has been +// removed from Clang, but which need to be kept around so as to retain ABI +// compatibility. +// +//===--------------------------------------------------------------------===// + +#include "clang-c/CXString.h" +#include "clang-c/Index.h" +#include "clang-c/Platform.h" +#include "llvm/Support/raw_ostream.h" + +extern "C" { + +// The functions below used to be part of the C API for ARCMigrate, which has +// since been removed from Clang; they already used to print an error if Clang +// was compiled without arcmt support, so we continue doing so. +CXRemapping clang_getRemappings(const char *) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; + return nullptr; +} + +CXRemapping clang_getRemappingsFromFileList(const char **, unsigned) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; + return nullptr; +} + +unsigned clang_remap_getNumFiles(CXRemapping) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; + return 0; +} + +void clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; +} + +void clang_remap_dispose(CXRemapping) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; +} + +} // extern "C" diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map index 49c472e3833fd..3d9d2e268a611 100644 --- a/clang/tools/libclang/libclang.map +++ b/clang/tools/libclang/libclang.map @@ -327,6 +327,8 @@ LLVM_13 { clang_getRange; clang_getRangeEnd; clang_getRangeStart; + clang_getRemappings; + clang_getRemappingsFromFileList; clang_getResultType; clang_getSkippedRanges; clang_getSpecializedCursorTemplate; @@ -387,6 +389,9 @@ LLVM_13 { clang_parseTranslationUnit; clang_parseTranslationUnit2; clang_parseTranslationUnit2FullArgv; + clang_remap_dispose; + clang_remap_getFilenames; + clang_remap_getNumFiles; clang_reparseTranslationUnit; clang_saveTranslationUnit; clang_sortCodeCompletionResults; diff --git a/clang/unittests/Analysis/CMakeLists.txt b/clang/unittests/Analysis/CMakeLists.txt index 059a74843155c..52e7d2854633d 100644 --- a/clang/unittests/Analysis/CMakeLists.txt +++ b/clang/unittests/Analysis/CMakeLists.txt @@ -4,6 +4,7 @@ add_clang_unittest(ClangAnalysisTests CloneDetectionTest.cpp ExprMutationAnalyzerTest.cpp IntervalPartitionTest.cpp + LifetimeSafetyTest.cpp MacroExpansionContextTest.cpp UnsafeBufferUsageTest.cpp CLANG_LIBS diff --git a/clang/unittests/Analysis/LifetimeSafetyTest.cpp b/clang/unittests/Analysis/LifetimeSafetyTest.cpp new file mode 100644 index 0000000000000..b08159714e78a --- /dev/null +++ b/clang/unittests/Analysis/LifetimeSafetyTest.cpp @@ -0,0 +1,439 @@ +//===- LifetimeSafetyTest.cpp - Lifetime Safety Tests -*---------- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/Analyses/LifetimeSafety.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/Testing/TestAST.h" +#include "llvm/ADT/StringMap.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include <optional> +#include <vector> + +namespace clang::lifetimes::internal { +namespace { + +using namespace ast_matchers; +using ::testing::UnorderedElementsAreArray; + +// A helper class to run the full lifetime analysis on a piece of code +// and provide an interface for querying the results. +class LifetimeTestRunner { +public: + LifetimeTestRunner(llvm::StringRef Code) { + std::string FullCode = R"( + #define POINT(name) void("__lifetime_test_point_" #name) + struct MyObj { ~MyObj() {} int i; }; + )"; + FullCode += Code.str(); + + AST = std::make_unique<TestAST>(FullCode); + ASTCtx = &AST->context(); + + // Find the target function using AST matchers. + auto MatchResult = + match(functionDecl(hasName("target")).bind("target"), *ASTCtx); + auto *FD = selectFirst<FunctionDecl>("target", MatchResult); + if (!FD) { + ADD_FAILURE() << "Test case must have a function named 'target'"; + return; + } + AnalysisCtx = std::make_unique<AnalysisDeclContext>(nullptr, FD); + AnalysisCtx->getCFGBuildOptions().setAllAlwaysAdd(); + + // Run the main analysis. + Analysis = std::make_unique<LifetimeSafetyAnalysis>(*AnalysisCtx); + Analysis->run(); + + AnnotationToPointMap = Analysis->getTestPoints(); + } + + LifetimeSafetyAnalysis &getAnalysis() { return *Analysis; } + ASTContext &getASTContext() { return *ASTCtx; } + + ProgramPoint getProgramPoint(llvm::StringRef Annotation) { + auto It = AnnotationToPointMap.find(Annotation); + if (It == AnnotationToPointMap.end()) { + ADD_FAILURE() << "Annotation '" << Annotation << "' not found."; + return nullptr; + } + return It->second; + } + +private: + std::unique_ptr<TestAST> AST; + ASTContext *ASTCtx = nullptr; + std::unique_ptr<AnalysisDeclContext> AnalysisCtx; + std::unique_ptr<LifetimeSafetyAnalysis> Analysis; + llvm::StringMap<ProgramPoint> AnnotationToPointMap; +}; + +// A convenience wrapper that uses the LifetimeSafetyAnalysis public API. +class LifetimeTestHelper { +public: + LifetimeTestHelper(LifetimeTestRunner &Runner) + : Runner(Runner), Analysis(Runner.getAnalysis()) {} + + std::optional<OriginID> getOriginForDecl(llvm::StringRef VarName) { + auto *VD = findDecl<ValueDecl>(VarName); + if (!VD) + return std::nullopt; + auto OID = Analysis.getOriginIDForDecl(VD); + if (!OID) + ADD_FAILURE() << "Origin for '" << VarName << "' not found."; + return OID; + } + + std::optional<LoanID> getLoanForVar(llvm::StringRef VarName) { + auto *VD = findDecl<ValueDecl>(VarName); + if (!VD) + return std::nullopt; + std::vector<LoanID> LID = Analysis.getLoanIDForVar(VD); + if (LID.empty()) { + ADD_FAILURE() << "Loan for '" << VarName << "' not found."; + return std::nullopt; + } + // TODO: Support retrieving more than one loan for a var.
+ if (LID.size() > 1) { + ADD_FAILURE() << "More than one loan found for '" << VarName << "'"; + return std::nullopt; + } + return LID[0]; + } + + std::optional<LoanSet> getLoansAtPoint(OriginID OID, + llvm::StringRef Annotation) { + ProgramPoint PP = Runner.getProgramPoint(Annotation); + if (!PP) + return std::nullopt; + return Analysis.getLoansAtPoint(OID, PP); + } + +private: + template <typename DeclT> DeclT *findDecl(llvm::StringRef Name) { + auto &Ctx = Runner.getASTContext(); + auto Results = match(valueDecl(hasName(Name)).bind("v"), Ctx); + if (Results.empty()) { + ADD_FAILURE() << "Declaration '" << Name << "' not found in AST."; + return nullptr; + } + return const_cast<DeclT *>(selectFirst<DeclT>("v", Results)); + } + + LifetimeTestRunner &Runner; + LifetimeSafetyAnalysis &Analysis; +}; + +// ========================================================================= // +// GTest Matchers & Fixture +// ========================================================================= // + +// Holds the name of the origin variable and a reference to the helper. +class OriginInfo { +public: + OriginInfo(llvm::StringRef OriginVar, LifetimeTestHelper &Helper) + : OriginVar(OriginVar), Helper(Helper) {} + llvm::StringRef OriginVar; + LifetimeTestHelper &Helper; +}; + +/// Matcher to verify the set of loans held by an origin at a specific +/// program point. +/// +/// This matcher is intended to be used with an \c OriginInfo object. +/// +/// \param LoanVars A vector of strings, where each string is the name of a +/// variable expected to be the source of a loan. +/// \param Annotation A string identifying the program point (created with +/// POINT()) where the check should be performed. +MATCHER_P2(HasLoansToImpl, LoanVars, Annotation, "") { + const OriginInfo &Info = arg; + std::optional<OriginID> OIDOpt = Info.Helper.getOriginForDecl(Info.OriginVar); + if (!OIDOpt) { + *result_listener << "could not find origin for '" << Info.OriginVar.str() + << "'"; + return false; + } + + std::optional<LoanSet> ActualLoansSetOpt = + Info.Helper.getLoansAtPoint(*OIDOpt, Annotation); + if (!ActualLoansSetOpt) { + *result_listener << "could not get a valid loan set at point '" + << Annotation << "'"; + return false; + } + std::vector<LoanID> ActualLoans(ActualLoansSetOpt->begin(), + ActualLoansSetOpt->end()); + + std::vector<LoanID> ExpectedLoans; + for (const auto &LoanVar : LoanVars) { + std::optional<LoanID> ExpectedLIDOpt = Info.Helper.getLoanForVar(LoanVar); + if (!ExpectedLIDOpt) { + *result_listener << "could not find loan for var '" << LoanVar << "'"; + return false; + } + ExpectedLoans.push_back(*ExpectedLIDOpt); + } + + return ExplainMatchResult(UnorderedElementsAreArray(ExpectedLoans), + ActualLoans, result_listener); +} + +// Base test fixture to manage the runner and helper. +class LifetimeAnalysisTest : public ::testing::Test { +protected: + void SetupTest(llvm::StringRef Code) { + Runner = std::make_unique<LifetimeTestRunner>(Code); + Helper = std::make_unique<LifetimeTestHelper>(*Runner); + } + + OriginInfo Origin(llvm::StringRef OriginVar) { + return OriginInfo(OriginVar, *Helper); + } + + // Factory function that hides the std::vector creation.
+ auto HasLoansTo(std::initializer_list<std::string> LoanVars, + const char *Annotation) { + return HasLoansToImpl(std::vector<std::string>(LoanVars), Annotation); + } + + std::unique_ptr<LifetimeTestRunner> Runner; + std::unique_ptr<LifetimeTestHelper> Helper; +}; + +// ========================================================================= // +// TESTS +// ========================================================================= // + +TEST_F(LifetimeAnalysisTest, SimpleLoanAndOrigin) { + SetupTest(R"( + void target() { + int x; + int* p = &x; + POINT(p1); + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"x"}, "p1")); +} + +TEST_F(LifetimeAnalysisTest, OverwriteOrigin) { + SetupTest(R"( + void target() { + MyObj s1, s2; + + MyObj* p = &s1; + POINT(after_s1); + + p = &s2; + POINT(after_s2); + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"s1"}, "after_s1")); + EXPECT_THAT(Origin("p"), HasLoansTo({"s2"}, "after_s2")); +} + +TEST_F(LifetimeAnalysisTest, ConditionalLoan) { + SetupTest(R"( + void target(bool cond) { + int a, b; + int *p = nullptr; + if (cond) { + p = &a; + POINT(after_then); + } else { + p = &b; + POINT(after_else); + } + POINT(after_if); + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"a"}, "after_then")); + EXPECT_THAT(Origin("p"), HasLoansTo({"b"}, "after_else")); + EXPECT_THAT(Origin("p"), HasLoansTo({"a", "b"}, "after_if")); +} + +TEST_F(LifetimeAnalysisTest, PointerChain) { + SetupTest(R"( + void target() { + MyObj y; + MyObj* ptr1 = &y; + POINT(p1); + + MyObj* ptr2 = ptr1; + POINT(p2); + + ptr2 = ptr1; + POINT(p3); + + ptr2 = ptr2; // Self assignment + POINT(p4); + } + )"); + EXPECT_THAT(Origin("ptr1"), HasLoansTo({"y"}, "p1")); + EXPECT_THAT(Origin("ptr2"), HasLoansTo({"y"}, "p2")); + EXPECT_THAT(Origin("ptr2"), HasLoansTo({"y"}, "p3")); + EXPECT_THAT(Origin("ptr2"), HasLoansTo({"y"}, "p4")); +} + +TEST_F(LifetimeAnalysisTest, ReassignToNull) { + SetupTest(R"( + void target() { + MyObj s1; + MyObj* p = &s1; + POINT(before_null); + p = nullptr; + POINT(after_null); + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"s1"}, "before_null")); + // After assigning to null, the origin for `p` should have no loans.
+ EXPECT_THAT(Origin("p"), HasLoansTo({}, "after_null")); +} + +TEST_F(LifetimeAnalysisTest, ReassignInIf) { + SetupTest(R"( + void target(bool condition) { + MyObj s1, s2; + MyObj* p = &s1; + POINT(before_if); + if (condition) { + p = &s2; + POINT(after_reassign); + } + POINT(after_if); + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"s1"}, "before_if")); + EXPECT_THAT(Origin("p"), HasLoansTo({"s2"}, "after_reassign")); + EXPECT_THAT(Origin("p"), HasLoansTo({"s1", "s2"}, "after_if")); +} + +TEST_F(LifetimeAnalysisTest, AssignInSwitch) { + SetupTest(R"( + void target(int mode) { + MyObj s1, s2, s3; + MyObj* p = nullptr; + switch (mode) { + case 1: + p = &s1; + POINT(case1); + break; + case 2: + p = &s2; + POINT(case2); + break; + default: + p = &s3; + POINT(case3); + break; + } + POINT(after_switch); + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"s1"}, "case1")); + EXPECT_THAT(Origin("p"), HasLoansTo({"s2"}, "case2")); + EXPECT_THAT(Origin("p"), HasLoansTo({"s3"}, "case3")); + EXPECT_THAT(Origin("p"), HasLoansTo({"s1", "s2", "s3"}, "after_switch")); +} + +TEST_F(LifetimeAnalysisTest, LoanInLoop) { + SetupTest(R"( + void target(bool condition) { + MyObj* p = nullptr; + while (condition) { + MyObj inner; + p = &inner; + POINT(in_loop); + } + POINT(after_loop); + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"inner"}, "in_loop")); + EXPECT_THAT(Origin("p"), HasLoansTo({"inner"}, "after_loop")); +} + +TEST_F(LifetimeAnalysisTest, LoopWithBreak) { + SetupTest(R"( + void target(int count) { + MyObj s1; + MyObj s2; + MyObj* p = &s1; + POINT(before_loop); + for (int i = 0; i < count; ++i) { + if (i == 5) { + p = &s2; + POINT(inside_if); + break; + } + POINT(after_if); + } + POINT(after_loop); + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"s1"}, "before_loop")); + EXPECT_THAT(Origin("p"), HasLoansTo({"s2"}, "inside_if")); + // At the join point after if, s2 cannot make it to p without the if. + EXPECT_THAT(Origin("p"), HasLoansTo({"s1"}, "after_if")); + // At the join point after the loop, p could hold a loan to s1 (if the loop + // completed normally) or to s2 (if the loop was broken). + EXPECT_THAT(Origin("p"), HasLoansTo({"s1", "s2"}, "after_loop")); +} + +TEST_F(LifetimeAnalysisTest, PointersInACycle) { + SetupTest(R"( + void target(bool condition) { + MyObj v1, v2, v3; + MyObj *p1 = &v1, *p2 = &v2, *p3 = &v3; + + POINT(before_while); + while (condition) { + MyObj* temp = p1; + p1 = p2; + p2 = p3; + p3 = temp; + } + POINT(after_loop); + } + )"); + EXPECT_THAT(Origin("p1"), HasLoansTo({"v1"}, "before_while")); + EXPECT_THAT(Origin("p2"), HasLoansTo({"v2"}, "before_while")); + EXPECT_THAT(Origin("p3"), HasLoansTo({"v3"}, "before_while")); + + // At the fixed point after the loop, all pointers could point to any of + // the three variables. 
+ EXPECT_THAT(Origin("p1"), HasLoansTo({"v1", "v2", "v3"}, "after_loop")); + EXPECT_THAT(Origin("p2"), HasLoansTo({"v1", "v2", "v3"}, "after_loop")); + EXPECT_THAT(Origin("p3"), HasLoansTo({"v1", "v2", "v3"}, "after_loop")); + EXPECT_THAT(Origin("temp"), HasLoansTo({"v1", "v2", "v3"}, "after_loop")); +} + +TEST_F(LifetimeAnalysisTest, NestedScopes) { + SetupTest(R"( + void target() { + MyObj* p = nullptr; + { + MyObj outer; + p = &outer; + POINT(before_inner_scope); + { + MyObj inner; + p = &inner; + POINT(inside_inner_scope); + } // inner expires + POINT(after_inner_scope); + } // outer expires + } + )"); + EXPECT_THAT(Origin("p"), HasLoansTo({"outer"}, "before_inner_scope")); + EXPECT_THAT(Origin("p"), HasLoansTo({"inner"}, "inside_inner_scope")); + EXPECT_THAT(Origin("p"), HasLoansTo({"inner"}, "after_inner_scope")); +} + +} // anonymous namespace +} // namespace clang::lifetimes::internal diff --git a/clang/unittests/Format/BracesInserterTest.cpp b/clang/unittests/Format/BracesInserterTest.cpp index e0c447d671f45..572e53e595e37 100644 --- a/clang/unittests/Format/BracesInserterTest.cpp +++ b/clang/unittests/Format/BracesInserterTest.cpp @@ -257,9 +257,9 @@ TEST_F(BracesInserterTest, InsertBracesRange) { FormatStyle Style = getLLVMStyle(); Style.InsertBraces = true; - const StringRef Code("while (a)\n" - " if (b)\n" - " return;"); + constexpr StringRef Code("while (a)\n" + " if (b)\n" + " return;"); verifyFormat("while (a) {\n" " if (b)\n" diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index d17109aebc0f8..65d8b36c677bd 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -259,6 +259,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_NESTED_BOOL(SpacesInParensOptions, Other); CHECK_PARSE_NESTED_BOOL(SortIncludes, Enabled); CHECK_PARSE_NESTED_BOOL(SortIncludes, IgnoreCase); + CHECK_PARSE_NESTED_BOOL(SortIncludes, IgnoreExtension); } #undef CHECK_PARSE_BOOL @@ -980,17 +981,20 @@ TEST(ConfigParseTest, ParsesConfiguration) { IncludeStyle.IncludeIsMainSourceRegex, "abc$"); Style.SortIncludes = {}; - CHECK_PARSE("SortIncludes: true", SortIncludes, - FormatStyle::SortIncludesOptions( - {/*Enabled=*/true, /*IgnoreCase=*/false})); + CHECK_PARSE( + "SortIncludes: true", SortIncludes, + FormatStyle::SortIncludesOptions( + {/*Enabled=*/true, /*IgnoreCase=*/false, /*IgnoreExtension=*/false})); CHECK_PARSE("SortIncludes: false", SortIncludes, FormatStyle::SortIncludesOptions({})); - CHECK_PARSE("SortIncludes: CaseInsensitive", SortIncludes, - FormatStyle::SortIncludesOptions( - {/*Enabled=*/true, /*IgnoreCase=*/true})); - CHECK_PARSE("SortIncludes: CaseSensitive", SortIncludes, - FormatStyle::SortIncludesOptions( - {/*Enabled=*/true, /*IgnoreCase=*/false})); + CHECK_PARSE( + "SortIncludes: CaseInsensitive", SortIncludes, + FormatStyle::SortIncludesOptions( + {/*Enabled=*/true, /*IgnoreCase=*/true, /*IgnoreExtension=*/false})); + CHECK_PARSE( + "SortIncludes: CaseSensitive", SortIncludes, + FormatStyle::SortIncludesOptions( + {/*Enabled=*/true, /*IgnoreCase=*/false, /*IgnoreExtension=*/false})); CHECK_PARSE("SortIncludes: Never", SortIncludes, FormatStyle::SortIncludesOptions({})); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 0bc1c6d45656e..dbf6950446ef0 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -3185,7 +3185,7 @@ TEST_F(FormatTest, FormatsLabels) { // The opening brace 
may either be on the same unwrapped line as the colon or // on a separate one. The formatter should recognize both. Style = getLLVMStyle(); - Style.BreakBeforeBraces = FormatStyle::BraceBreakingStyle::BS_Allman; + Style.BreakBeforeBraces = FormatStyle::BS_Allman; verifyFormat("{\n" " some_code();\n" "test_label:\n" @@ -3206,7 +3206,7 @@ TEST_F(FormatTest, FormatsLabels) { TEST_F(FormatTest, MultiLineControlStatements) { FormatStyle Style = getLLVMStyleWithColumns(20); - Style.BreakBeforeBraces = FormatStyle::BraceBreakingStyle::BS_Custom; + Style.BreakBeforeBraces = FormatStyle::BS_Custom; Style.BraceWrapping.AfterControlStatement = FormatStyle::BWACS_MultiLine; // Short lines should keep opening brace on same line. verifyFormat("if (foo) {\n" @@ -3441,7 +3441,7 @@ TEST_F(FormatTest, MultiLineControlStatements) { TEST_F(FormatTest, BeforeWhile) { FormatStyle Style = getLLVMStyle(); - Style.BreakBeforeBraces = FormatStyle::BraceBreakingStyle::BS_Custom; + Style.BreakBeforeBraces = FormatStyle::BS_Custom; verifyFormat("do {\n" " foo();\n" @@ -4803,12 +4803,13 @@ TEST_F(FormatTest, FormatsInlineASM) { "int i;"); auto Style = getLLVMStyleWithColumns(0); - const StringRef Code1{"asm(\"xyz\" : \"=a\"(a), \"=d\"(b) : \"a\"(data));"}; - const StringRef Code2{"asm(\"xyz\"\n" - " : \"=a\"(a), \"=d\"(b)\n" - " : \"a\"(data));"}; - const StringRef Code3{"asm(\"xyz\" : \"=a\"(a), \"=d\"(b)\n" - " : \"a\"(data));"}; + constexpr StringRef Code1( + "asm(\"xyz\" : \"=a\"(a), \"=d\"(b) : \"a\"(data));"); + constexpr StringRef Code2("asm(\"xyz\"\n" + " : \"=a\"(a), \"=d\"(b)\n" + " : \"a\"(data));"); + constexpr StringRef Code3("asm(\"xyz\" : \"=a\"(a), \"=d\"(b)\n" + " : \"a\"(data));"); Style.BreakBeforeInlineASMColon = FormatStyle::BBIAS_OnlyMultiline; verifyFormat(Code1, Style); @@ -6681,6 +6682,17 @@ TEST_F(FormatTest, EscapedNewlines) { " int x(int a);", AlignLeft); + // Escaped with a trigraph. The program just has to avoid crashing. 
+ verifyNoCrash("#define A \?\?/\n" + "int i;\?\?/\n" + " int j;"); + verifyNoCrash("#define A \?\?/\r\n" + "int i;\?\?/\r\n" + " int j;"); + verifyNoCrash("#define A \?\?/\n" + "int i;", + getGoogleStyle(FormatStyle::LK_CSharp)); + // CRLF line endings verifyFormat("#define A \\\r\n int i; \\\r\n int j;", "#define A \\\r\nint i;\\\r\n int j;", Narrow); @@ -6693,16 +6705,16 @@ TEST_F(FormatTest, EscapedNewlines) { " int x(int a);", AlignLeft); - constexpr StringRef Code{"#define A \\\n" + constexpr StringRef Code("#define A \\\n" " int a123; \\\n" " int a; \\\n" - " int a1234;"}; + " int a1234;"); verifyFormat(Code, AlignLeft); - constexpr StringRef Code2{"#define A \\\n" + constexpr StringRef Code2("#define A \\\n" " int a123; \\\n" " int a; \\\n" - " int a1234;"}; + " int a1234;"); auto LastLine = getLLVMStyle(); LastLine.AlignEscapedNewlines = FormatStyle::ENAS_LeftWithLastLine; verifyFormat(Code2, LastLine); @@ -12097,9 +12109,9 @@ TEST_F(FormatTest, PointerAlignmentFallback) { FormatStyle Style = getLLVMStyle(); Style.DerivePointerAlignment = true; - const StringRef Code("int* p;\n" - "int *q;\n" - "int * r;"); + constexpr StringRef Code("int* p;\n" + "int *q;\n" + "int * r;"); EXPECT_EQ(Style.PointerAlignment, FormatStyle::PAS_Right); verifyFormat("int *p;\n" @@ -15014,7 +15026,7 @@ TEST_F(FormatTest, PullTrivialFunctionDefinitionsIntoSingleLine) { " aaaaaaaaaaaaaaaaaa,\n" " aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb) {}"); - constexpr StringRef Code{"void foo() { /* Empty */ }"}; + constexpr StringRef Code("void foo() { /* Empty */ }"); verifyFormat(Code); verifyFormat(Code, "void foo() { /* Empty */\n" "}"); @@ -23779,7 +23791,7 @@ TEST_F(FormatTest, FormatsLambdas) { LLVMWithBeforeLambdaBody.BreakBeforeBraces = FormatStyle::BS_Custom; LLVMWithBeforeLambdaBody.BraceWrapping.BeforeLambdaBody = true; LLVMWithBeforeLambdaBody.AllowShortLambdasOnASingleLine = - FormatStyle::ShortLambdaStyle::SLS_None; + FormatStyle::SLS_None; verifyFormat("FctWithOneNestedLambdaInline_SLS_None(\n" " []()\n" " {\n" @@ -23815,7 +23827,7 @@ TEST_F(FormatTest, FormatsLambdas) { LLVMWithBeforeLambdaBody); LLVMWithBeforeLambdaBody.AllowShortLambdasOnASingleLine = - FormatStyle::ShortLambdaStyle::SLS_Empty; + FormatStyle::SLS_Empty; verifyFormat("FctWithOneNestedLambdaInline_SLS_Empty(\n" " []()\n" " {\n" @@ -23862,7 +23874,7 @@ TEST_F(FormatTest, FormatsLambdas) { LLVMWithBeforeLambdaBody); LLVMWithBeforeLambdaBody.AllowShortLambdasOnASingleLine = - FormatStyle::ShortLambdaStyle::SLS_Inline; + FormatStyle::SLS_Inline; verifyFormat("FctWithOneNestedLambdaInline_SLS_Inline([]() { return 17; });", LLVMWithBeforeLambdaBody); verifyFormat("FctWithOneNestedLambdaEmpty_SLS_Inline([]() {});", @@ -23893,7 +23905,7 @@ TEST_F(FormatTest, FormatsLambdas) { LLVMWithBeforeLambdaBody); LLVMWithBeforeLambdaBody.AllowShortLambdasOnASingleLine = - FormatStyle::ShortLambdaStyle::SLS_All; + FormatStyle::SLS_All; verifyFormat("FctWithOneNestedLambdaInline_SLS_All([]() { return 17; });", LLVMWithBeforeLambdaBody); verifyFormat("FctWithOneNestedLambdaEmpty_SLS_All([]() {});", @@ -24025,7 +24037,7 @@ TEST_F(FormatTest, FormatsLambdas) { LLVMWithBeforeLambdaBody); LLVMWithBeforeLambdaBody.AllowShortLambdasOnASingleLine = - FormatStyle::ShortLambdaStyle::SLS_None; + FormatStyle::SLS_None; verifyFormat("auto select = [this]() -> const Library::Object *\n" "{\n" @@ -24273,7 +24285,7 @@ TEST_F(FormatTest, LambdaWithLineComments) { LLVMWithBeforeLambdaBody.BreakBeforeBraces = FormatStyle::BS_Custom; 
LLVMWithBeforeLambdaBody.BraceWrapping.BeforeLambdaBody = true; LLVMWithBeforeLambdaBody.AllowShortLambdasOnASingleLine = - FormatStyle::ShortLambdaStyle::SLS_All; + FormatStyle::SLS_All; verifyFormat("auto k = []() { return; }", LLVMWithBeforeLambdaBody); verifyFormat("auto k = []() // comment\n" @@ -27244,7 +27256,7 @@ TEST_F(FormatTest, IndentAccessModifiers) { TEST_F(FormatTest, LimitlessStringsAndComments) { auto Style = getLLVMStyleWithColumns(0); - constexpr StringRef Code = + constexpr StringRef Code( "/**\n" " * This is a multiline comment with quite some long lines, at least for " "the LLVM Style.\n" @@ -27265,7 +27277,7 @@ TEST_F(FormatTest, LimitlessStringsAndComments) { " const std::string SmallString = \"Hello World\";\n" " // Small line comment\n" " return String.size() > SmallString.size();\n" - "}"; + "}"); verifyNoChange(Code, Style); } @@ -28371,10 +28383,15 @@ TEST_F(FormatTest, BreakAfterAttributes) { "Foo &operator-(Foo &);", Style); - Style.ReferenceAlignment = FormatStyle::ReferenceAlignmentStyle::RAS_Left; + Style.ReferenceAlignment = FormatStyle::RAS_Left; verifyFormat("[[nodiscard]]\n" "Foo& operator-(Foo&);", Style); + + Style.BreakBeforeBinaryOperators = FormatStyle::BOS_All; + verifyFormat("[[deprecated]]\n" + "void f() = delete;", + Style); } TEST_F(FormatTest, InsertNewlineAtEOF) { @@ -28384,9 +28401,9 @@ TEST_F(FormatTest, InsertNewlineAtEOF) { verifyNoChange("int i;\n", Style); verifyFormat("int i;\n", "int i;", Style); - constexpr StringRef Code{"namespace {\n" + constexpr StringRef Code("namespace {\n" "int i;\n" - "} // namespace"}; + "} // namespace"); verifyFormat(Code.str() + '\n', Code, Style, {tooling::Range(19, 13)}); // line 3 } @@ -28395,7 +28412,7 @@ TEST_F(FormatTest, KeepEmptyLinesAtEOF) { FormatStyle Style = getLLVMStyle(); Style.KeepEmptyLines.AtEndOfFile = true; - const StringRef Code{"int i;\n\n"}; + constexpr StringRef Code("int i;\n\n"); verifyNoChange(Code, Style); verifyFormat(Code, "int i;\n\n\n", Style); } @@ -28628,8 +28645,8 @@ TEST_F(FormatTest, PPDirectivesAndCommentsInBracedInit) { } TEST_F(FormatTest, BreakAdjacentStringLiterals) { - constexpr StringRef Code{ - "return \"Code\" \"\\0\\52\\26\\55\\55\\0\" \"x013\" \"\\02\\xBA\";"}; + constexpr StringRef Code( + "return \"Code\" \"\\0\\52\\26\\55\\55\\0\" \"x013\" \"\\02\\xBA\";"); verifyFormat("return \"Code\"\n" " \"\\0\\52\\26\\55\\55\\0\"\n" @@ -29040,9 +29057,9 @@ TEST_F(FormatTest, KeepFormFeed) { auto Style = getLLVMStyle(); Style.KeepFormFeed = true; - constexpr StringRef NoFormFeed{"int i;\n" + constexpr StringRef NoFormFeed("int i;\n" "\n" - "void f();"}; + "void f();"); verifyFormat(NoFormFeed, "int i;\n" " \f\n" @@ -29064,9 +29081,9 @@ TEST_F(FormatTest, KeepFormFeed) { "void f();\f", Style); - constexpr StringRef FormFeed{"int i;\n" + constexpr StringRef FormFeed("int i;\n" "\f\n" - "void f();"}; + "void f();"); verifyNoChange(FormFeed, Style); Style.LineEnding = FormatStyle::LE_LF; @@ -29076,10 +29093,10 @@ TEST_F(FormatTest, KeepFormFeed) { "void f();", Style); - constexpr StringRef FormFeedBeforeEmptyLine{"int i;\n" + constexpr StringRef FormFeedBeforeEmptyLine("int i;\n" "\f\n" "\n" - "void f();"}; + "void f();"); Style.MaxEmptyLinesToKeep = 2; verifyFormat(FormFeedBeforeEmptyLine, "int i;\n" diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp index 88707551b7698..69026bce98705 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -1120,11 +1120,11 
@@ TEST_F(FormatTestComments, KeepsLevelOfCommentBeforePPDirective) { " }\n" "}")); - const StringRef Code("void func() {\n" - " // clang-format off\n" - " #define KV(value) #value, value\n" - " // clang-format on\n" - "}"); + constexpr StringRef Code("void func() {\n" + " // clang-format off\n" + " #define KV(value) #value, value\n" + " // clang-format on\n" + "}"); verifyNoChange(Code); auto Style = getLLVMStyle(); diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp index ca5aba043b932..127556488bab0 100644 --- a/clang/unittests/Format/FormatTestJava.cpp +++ b/clang/unittests/Format/FormatTestJava.cpp @@ -631,17 +631,17 @@ TEST_F(FormatTestJava, SwitchExpression) { "});", Style); - constexpr StringRef Code1{"i = switch (day) {\n" + constexpr StringRef Code1("i = switch (day) {\n" " case THURSDAY, SATURDAY -> 8;\n" " case WEDNESDAY -> 9;\n" " default -> 0;\n" - "};"}; + "};"); verifyFormat(Code1, Style); Style.IndentCaseLabels = true; verifyFormat(Code1, Style); - constexpr StringRef Code2{"i = switch (day) {\n" + constexpr StringRef Code2("i = switch (day) {\n" " case THURSDAY, SATURDAY -> {\n" " foo();\n" " yield 8;\n" @@ -653,17 +653,17 @@ TEST_F(FormatTestJava, SwitchExpression) { " default -> {\n" " yield 0;\n" " }\n" - "};"}; + "};"); verifyFormat(Code2, Style); Style.IndentCaseLabels = false; verifyFormat(Code2, Style); - constexpr StringRef Code3{"switch (day) {\n" + constexpr StringRef Code3("switch (day) {\n" "case THURSDAY, SATURDAY -> i = 8;\n" "case WEDNESDAY -> i = 9;\n" "default -> i = 0;\n" - "};"}; + "};"); verifyFormat(Code3, Style); Style.IndentCaseLabels = true; diff --git a/clang/unittests/Format/FormatTestSelective.cpp b/clang/unittests/Format/FormatTestSelective.cpp index 0b7ac21fd33d3..1a01153a0af99 100644 --- a/clang/unittests/Format/FormatTestSelective.cpp +++ b/clang/unittests/Format/FormatTestSelective.cpp @@ -672,15 +672,14 @@ TEST_F(FormatTestSelective, FormatMacroRegardlessOfPreviousIndent) { // need to be adapted. Style = getLLVMStyle(); - const StringRef Code{" class Foo {\n" - " void test() {\n" - " #ifdef 1\n" - " #define some\n" // format this line - " #endif\n" - " }};"}; - - EXPECT_EQ(Style.IndentPPDirectives, - FormatStyle::PPDirectiveIndentStyle::PPDIS_None); + constexpr StringRef Code(" class Foo {\n" + " void test() {\n" + " #ifdef 1\n" + " #define some\n" // format this line + " #endif\n" + " }};"); + + EXPECT_EQ(Style.IndentPPDirectives, FormatStyle::PPDIS_None); EXPECT_EQ(" class Foo {\n" " void test() {\n" " #ifdef 1\n" @@ -689,8 +688,7 @@ TEST_F(FormatTestSelective, FormatMacroRegardlessOfPreviousIndent) { " }};", // Ditto: Bug? 
format(Code, 57, 0)); - Style.IndentPPDirectives = - FormatStyle::PPDirectiveIndentStyle::PPDIS_BeforeHash; + Style.IndentPPDirectives = FormatStyle::PPDIS_BeforeHash; EXPECT_EQ(" class Foo {\n" " void test() {\n" " #ifdef 1\n" @@ -699,8 +697,7 @@ TEST_F(FormatTestSelective, FormatMacroRegardlessOfPreviousIndent) { " }};", format(Code, 57, 0)); - Style.IndentPPDirectives = - FormatStyle::PPDirectiveIndentStyle::PPDIS_AfterHash; + Style.IndentPPDirectives = FormatStyle::PPDIS_AfterHash; EXPECT_EQ(" class Foo {\n" " void test() {\n" " #ifdef 1\n" diff --git a/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp b/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp index b1e42e924e05c..8681c3d2f89ce 100644 --- a/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp +++ b/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp @@ -24,7 +24,7 @@ TEST_F(IntegerLiteralSeparatorTest, SingleQuoteAsSeparator) { EXPECT_EQ(Style.IntegerLiteralSeparator.Decimal, 0); EXPECT_EQ(Style.IntegerLiteralSeparator.Hex, 0); - const StringRef Binary("b = 0b10011'11'0110'1u;"); + constexpr StringRef Binary("b = 0b10011'11'0110'1u;"); verifyFormat(Binary, Style); Style.IntegerLiteralSeparator.Binary = -1; verifyFormat("b = 0b100111101101u;", Binary, Style); @@ -33,14 +33,14 @@ TEST_F(IntegerLiteralSeparatorTest, SingleQuoteAsSeparator) { Style.IntegerLiteralSeparator.Binary = 4; verifyFormat("b = 0b1001'1110'1101u;", Binary, Style); - const StringRef Decimal("d = 184467'440737'0'95505'92Ull;"); + constexpr StringRef Decimal("d = 184467'440737'0'95505'92Ull;"); verifyFormat(Decimal, Style); Style.IntegerLiteralSeparator.Decimal = -1; verifyFormat("d = 18446744073709550592Ull;", Decimal, Style); Style.IntegerLiteralSeparator.Decimal = 3; verifyFormat("d = 18'446'744'073'709'550'592Ull;", Decimal, Style); - const StringRef Hex("h = 0xDEAD'BEEF'DE'AD'BEE'Fuz;"); + constexpr StringRef Hex("h = 0xDEAD'BEEF'DE'AD'BEE'Fuz;"); verifyFormat(Hex, Style); Style.IntegerLiteralSeparator.Hex = -1; verifyFormat("h = 0xDEADBEEFDEADBEEFuz;", Hex, Style); @@ -87,9 +87,9 @@ TEST_F(IntegerLiteralSeparatorTest, SingleQuoteAsSeparator) { TEST_F(IntegerLiteralSeparatorTest, UnderscoreAsSeparator) { FormatStyle Style = getLLVMStyle(); - const StringRef Binary("B = 0B10011_11_0110_1;"); - const StringRef Decimal("d = 184467_440737_0_95505_92;"); - const StringRef Hex("H = 0XDEAD_BEEF_DE_AD_BEE_F;"); + constexpr StringRef Binary("B = 0B10011_11_0110_1;"); + constexpr StringRef Decimal("d = 184467_440737_0_95505_92;"); + constexpr StringRef Hex("H = 0XDEAD_BEEF_DE_AD_BEE_F;"); auto TestUnderscore = [&](auto Language) { Style.Language = Language; @@ -173,16 +173,16 @@ TEST_F(IntegerLiteralSeparatorTest, FixRanges) { FormatStyle Style = getLLVMStyle(); Style.IntegerLiteralSeparator.Decimal = 3; - const StringRef Code("i = -12'34;\n" - "// clang-format off\n" - "j = 123'4;\n" - "// clang-format on\n" - "k = +1'23'4;"); - const StringRef Expected("i = -1'234;\n" + constexpr StringRef Code("i = -12'34;\n" "// clang-format off\n" "j = 123'4;\n" "// clang-format on\n" - "k = +1'234;"); + "k = +1'23'4;"); + constexpr StringRef Expected("i = -1'234;\n" + "// clang-format off\n" + "j = 123'4;\n" + "// clang-format on\n" + "k = +1'234;"); verifyFormat(Expected, Code, Style); diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp index 994227efdd4f8..48ecd5d32d034 100644 --- a/clang/unittests/Format/SortIncludesTest.cpp +++ b/clang/unittests/Format/SortIncludesTest.cpp @@ -1084,10 +1084,10 @@ 
TEST_F(SortIncludesTest, DoNotSortLikelyXml) { } TEST_F(SortIncludesTest, DoNotSortCSharp) { - constexpr StringRef Code{"const string expectedDataStruct = @\"\n" + constexpr StringRef Code("const string expectedDataStruct = @\"\n" " #include \n" " #include \n" - " \";"}; + " \";"); FmtStyle.Language = FormatStyle::LK_CSharp; EXPECT_TRUE(sortIncludes(FmtStyle, Code, GetCodeRange(Code), "a.cs").empty()); } @@ -1483,6 +1483,26 @@ TEST_F(SortIncludesTest, BlockCommentedOutIncludes) { verifyFormat(Code, sort(Code, "input.cpp", 0)); } +TEST_F(SortIncludesTest, IgnoreExtension) { + FmtStyle.SortIncludes.IgnoreExtension = true; + + verifyFormat("#include \n" + "#include \n" + "#include ", + sort("#include \n" + "#include \n" + "#include ", + "input.h")); + + verifyFormat("#include \n" + "#include \n" + "#include ", + sort("#include \n" + "#include \n" + "#include ", + "input.h")); +} + } // end namespace } // end namespace format } // end namespace clang diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index e281a4945a862..7f99655b1fa49 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -390,6 +390,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_CompoundRequirementLBrace); EXPECT_TOKEN(Tokens[22], tok::star, TT_BinaryOperator); + Tokens = annotate("bool foo = requires { static_cast(1); };"); + ASSERT_EQ(Tokens.size(), 17u) << Tokens; + EXPECT_TOKEN(Tokens[8], tok::ampamp, TT_PointerOrReference); + Tokens = annotate("return s.operator int *();"); ASSERT_EQ(Tokens.size(), 10u) << Tokens; // Not TT_FunctionDeclarationName. @@ -614,7 +618,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { EXPECT_TOKEN(Tokens[19], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[20], tok::r_brace, TT_StructRBrace); - constexpr StringRef Code{"struct EXPORT StructName {};"}; + constexpr StringRef Code("struct EXPORT StructName {};"); Tokens = annotate(Code); ASSERT_EQ(Tokens.size(), 7u) << Tokens; @@ -3954,7 +3958,7 @@ TEST_F(TokenAnnotatorTest, SplitPenalty) { } TEST_F(TokenAnnotatorTest, TemplateName) { - constexpr StringRef Code{"return Foo < A || B > (C ^ D);"}; + constexpr StringRef Code("return Foo < A || B > (C ^ D);"); auto Tokens = annotate(Code); ASSERT_EQ(Tokens.size(), 14u) << Tokens; diff --git a/clang/unittests/Interpreter/InterpreterTest.cpp b/clang/unittests/Interpreter/InterpreterTest.cpp index b97f5ae17c9f0..2ba15cbd37093 100644 --- a/clang/unittests/Interpreter/InterpreterTest.cpp +++ b/clang/unittests/Interpreter/InterpreterTest.cpp @@ -22,6 +22,8 @@ #include "clang/Sema/Lookup.h" #include "clang/Sema/Sema.h" +#include "llvm/TargetParser/Host.h" + #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -389,6 +391,26 @@ TEST_F(InterpreterTest, Value) { EXPECT_TRUE(V9.getType()->isMemberFunctionPointerType()); EXPECT_EQ(V9.getKind(), Value::K_PtrOrObj); EXPECT_TRUE(V9.isManuallyAlloc()); + + Value V10; + llvm::cantFail(Interp->ParseAndExecute( + "enum D : unsigned int {Zero = 0, One}; One", &V10)); + + std::string prettyType; + llvm::raw_string_ostream OSType(prettyType); + V10.printType(OSType); + EXPECT_STREQ(prettyType.c_str(), "D"); + + // FIXME: We should print only the value or the constant not the type. 
+ std::string prettyData; + llvm::raw_string_ostream OSData(prettyData); + V10.printData(OSData); + EXPECT_STREQ(prettyData.c_str(), "(One) : unsigned int 1"); + + std::string prettyPrint; + llvm::raw_string_ostream OSPrint(prettyPrint); + V10.print(OSPrint); + EXPECT_STREQ(prettyPrint.c_str(), "(D) (One) : unsigned int 1\n"); } TEST_F(InterpreterTest, TranslationUnit_CanonicalDecl) { diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp index 46dbb4d4b91b4..ddc87921ea084 100644 --- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp +++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp @@ -640,14 +640,14 @@ TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) { EXPECT_STREQ("@import A;\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import A\n;", Out)); - EXPECT_STREQ("@import A;\n", Out.data()); + EXPECT_STREQ("@import A\n;\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import A.B;\n", Out)); EXPECT_STREQ("@import A.B;\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives( - "@import /*x*/ A /*x*/ . /*x*/ B /*x*/ \n /*x*/ ; /*x*/", Out)); - EXPECT_STREQ("@import A.B;\n", Out.data()); + "@import /*x*/ A /*x*/ . /*x*/ B /*x*/ \\n /*x*/ ; /*x*/", Out)); + EXPECT_STREQ("@import A.B\\n;\n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, EmptyIncludesAndImports) { @@ -1122,16 +1122,23 @@ ort \ )"; ASSERT_FALSE( minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives)); - EXPECT_STREQ("#include \"textual-header.h\"\nexport module m;" - "exp\\\nort import:l[[rename]];" - "import<<=3;import a b d e d e f e;" - "import foo[[no_unique_address]];import foo();" - "import f(:sefse);import f(->a=3);" + + EXPECT_STREQ("module;\n" + "#include \"textual-header.h\"\n" + "export module m;\n" + "exp\\\nort import:l[[rename]];\n" + "import<<=3;\n" + "import a b d e d e f e;\n" + "import foo[[no_unique_address]];\n" + "import foo();\n" + "import f(:sefse);\n" + "import f(->a=3);\n" "\n", Out.data()); - ASSERT_EQ(Directives.size(), 11u); - EXPECT_EQ(Directives[0].Kind, pp_include); - EXPECT_EQ(Directives[1].Kind, cxx_export_module_decl); + ASSERT_EQ(Directives.size(), 12u); + EXPECT_EQ(Directives[0].Kind, cxx_module_decl); + EXPECT_EQ(Directives[1].Kind, pp_include); + EXPECT_EQ(Directives[2].Kind, cxx_export_module_decl); } TEST(MinimizeSourceToDependencyDirectivesTest, ObjCMethodArgs) { diff --git a/clang/unittests/StaticAnalyzer/BlockEntranceCallbackTest.cpp b/clang/unittests/StaticAnalyzer/BlockEntranceCallbackTest.cpp index 0f05c39df93e0..d15bec02879f2 100644 --- a/clang/unittests/StaticAnalyzer/BlockEntranceCallbackTest.cpp +++ b/clang/unittests/StaticAnalyzer/BlockEntranceCallbackTest.cpp @@ -91,8 +91,7 @@ void addBlockEntranceTester(AnalysisASTConsumer &AnalysisConsumer, AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker(&registerChecker, &shouldAlwaysRegister, "test.BlockEntranceTester", - "EmptyDescription", "EmptyDocsUri", - /*IsHidden=*/false); + "EmptyDescription"); }); } @@ -102,8 +101,7 @@ void addBranchConditionTester(AnalysisASTConsumer &AnalysisConsumer, AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker(&registerChecker, &shouldAlwaysRegister, "test.BranchConditionTester", - "EmptyDescription", "EmptyDocsUri", - /*IsHidden=*/false); + "EmptyDescription"); }); } diff --git a/clang/unittests/StaticAnalyzer/BugReportInterestingnessTest.cpp
b/clang/unittests/StaticAnalyzer/BugReportInterestingnessTest.cpp index 0ef63b049621e..fc50f0028015b 100644 --- a/clang/unittests/StaticAnalyzer/BugReportInterestingnessTest.cpp +++ b/clang/unittests/StaticAnalyzer/BugReportInterestingnessTest.cpp @@ -120,7 +120,7 @@ class TestAction : public ASTFrontendAction { std::move(ExpectedDiags), Compiler.getSourceManager())); AnalysisConsumer->AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker("test.Interestingness", - "Description", ""); + "MockDescription"); }); Compiler.getAnalyzerOpts().CheckersAndPackages = { {"test.Interestingness", true}}; diff --git a/clang/unittests/StaticAnalyzer/CallDescriptionTest.cpp b/clang/unittests/StaticAnalyzer/CallDescriptionTest.cpp index 4cb6bd34fa36d..e2007a9589c60 100644 --- a/clang/unittests/StaticAnalyzer/CallDescriptionTest.cpp +++ b/clang/unittests/StaticAnalyzer/CallDescriptionTest.cpp @@ -616,8 +616,8 @@ void addCallDescChecker(AnalysisASTConsumer &AnalysisConsumer, AnalyzerOptions &AnOpts) { AnOpts.CheckersAndPackages = {{"test.CallDescChecker", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { - Registry.addChecker("test.CallDescChecker", "Description", - ""); + Registry.addChecker("test.CallDescChecker", + "MockDescription"); }); } diff --git a/clang/unittests/StaticAnalyzer/CallEventTest.cpp b/clang/unittests/StaticAnalyzer/CallEventTest.cpp index 2843572e5f800..8b5289ea7472b 100644 --- a/clang/unittests/StaticAnalyzer/CallEventTest.cpp +++ b/clang/unittests/StaticAnalyzer/CallEventTest.cpp @@ -56,7 +56,7 @@ void addCXXDeallocatorChecker(AnalysisASTConsumer &AnalysisConsumer, AnOpts.CheckersAndPackages = {{"test.CXXDeallocator", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker("test.CXXDeallocator", - "Description", ""); + "MockDescription"); }); } diff --git a/clang/unittests/StaticAnalyzer/ConflictingEvalCallsTest.cpp b/clang/unittests/StaticAnalyzer/ConflictingEvalCallsTest.cpp index e410cca076637..cffdbf1896df3 100644 --- a/clang/unittests/StaticAnalyzer/ConflictingEvalCallsTest.cpp +++ b/clang/unittests/StaticAnalyzer/ConflictingEvalCallsTest.cpp @@ -33,10 +33,8 @@ void addEvalFooCheckers(AnalysisASTConsumer &AnalysisConsumer, AnOpts.CheckersAndPackages = {{"test.EvalFoo1", true}, {"test.EvalFoo2", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { - Registry.addChecker("test.EvalFoo1", "EmptyDescription", - "EmptyDocsUri"); - Registry.addChecker("test.EvalFoo2", "EmptyDescription", - "EmptyDocsUri"); + Registry.addChecker("test.EvalFoo1", "MockDescription"); + Registry.addChecker("test.EvalFoo2", "MockDescription"); }); } } // namespace diff --git a/clang/unittests/StaticAnalyzer/ExprEngineVisitTest.cpp b/clang/unittests/StaticAnalyzer/ExprEngineVisitTest.cpp index b6eeb9ce37386..12be2289c3174 100644 --- a/clang/unittests/StaticAnalyzer/ExprEngineVisitTest.cpp +++ b/clang/unittests/StaticAnalyzer/ExprEngineVisitTest.cpp @@ -78,7 +78,7 @@ void addExprEngineVisitPreChecker(AnalysisASTConsumer &AnalysisConsumer, AnOpts.CheckersAndPackages = {{"ExprEngineVisitPreChecker", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker("ExprEngineVisitPreChecker", - "Desc", "DocsURI"); + "MockDescription"); }); } @@ -87,7 +87,7 @@ void addExprEngineVisitPostChecker(AnalysisASTConsumer &AnalysisConsumer, AnOpts.CheckersAndPackages = {{"ExprEngineVisitPostChecker", true}}; 
AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker( - "ExprEngineVisitPostChecker", "Desc", "DocsURI"); + "ExprEngineVisitPostChecker", "MockDescription"); }); } @@ -95,8 +95,8 @@ void addMemAccessChecker(AnalysisASTConsumer &AnalysisConsumer, AnalyzerOptions &AnOpts) { AnOpts.CheckersAndPackages = {{"MemAccessChecker", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { - Registry.addChecker("MemAccessChecker", "Desc", - "DocsURI"); + Registry.addChecker("MemAccessChecker", + "MockDescription"); }); } diff --git a/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp b/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp index 8f0a96d41e752..146797f5b17f2 100644 --- a/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp +++ b/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp @@ -92,8 +92,8 @@ void addFalsePositiveGenerator(AnalysisASTConsumer &AnalysisConsumer, AnOpts.CheckersAndPackages = {{"test.FalsePositiveGenerator", true}, {"debug.ViewExplodedGraph", false}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { - Registry.addChecker( - "test.FalsePositiveGenerator", "EmptyDescription", "EmptyDocsUri"); + Registry.addChecker("test.FalsePositiveGenerator", + "MockDescription"); }); } diff --git a/clang/unittests/StaticAnalyzer/MemRegionDescriptiveNameTest.cpp b/clang/unittests/StaticAnalyzer/MemRegionDescriptiveNameTest.cpp index 0f6e49bf42f4a..7b837f3b7fb2a 100644 --- a/clang/unittests/StaticAnalyzer/MemRegionDescriptiveNameTest.cpp +++ b/clang/unittests/StaticAnalyzer/MemRegionDescriptiveNameTest.cpp @@ -46,7 +46,7 @@ void addDescriptiveNameChecker(AnalysisASTConsumer &AnalysisConsumer, AnOpts.CheckersAndPackages = {{"DescriptiveNameChecker", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker("DescriptiveNameChecker", - "Desc", "DocsURI"); + "MockDescription"); }); } diff --git a/clang/unittests/StaticAnalyzer/NoStateChangeFuncVisitorTest.cpp b/clang/unittests/StaticAnalyzer/NoStateChangeFuncVisitorTest.cpp index a9033425dfb51..68d267853e926 100644 --- a/clang/unittests/StaticAnalyzer/NoStateChangeFuncVisitorTest.cpp +++ b/clang/unittests/StaticAnalyzer/NoStateChangeFuncVisitorTest.cpp @@ -140,7 +140,7 @@ void addNonThoroughStatefulChecker(AnalysisASTConsumer &AnalysisConsumer, AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry .addChecker>( - "test.StatefulChecker", "Description", ""); + "test.StatefulChecker", "MockDescription"); }); } @@ -233,7 +233,7 @@ void addThoroughStatefulChecker(AnalysisASTConsumer &AnalysisConsumer, AnOpts.CheckersAndPackages = {{"test.StatefulChecker", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker>( - "test.StatefulChecker", "Description", ""); + "test.StatefulChecker", "MockDescription"); }); } diff --git a/clang/unittests/StaticAnalyzer/ObjcBug-124477.cpp b/clang/unittests/StaticAnalyzer/ObjcBug-124477.cpp index 51bd33210032c..ab78090b42f31 100644 --- a/clang/unittests/StaticAnalyzer/ObjcBug-124477.cpp +++ b/clang/unittests/StaticAnalyzer/ObjcBug-124477.cpp @@ -37,7 +37,7 @@ void addFlagFlipperChecker(AnalysisASTConsumer &AnalysisConsumer, AnOpts.CheckersAndPackages = {{"test.FlipFlagOnCheckLocation", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker("test.FlipFlagOnCheckLocation", - 
"Description", ""); + "MockDescription"); }); } diff --git a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp index 454eee9cf7e0a..e17d107d90cef 100644 --- a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp +++ b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp @@ -44,7 +44,7 @@ void addCustomChecker(AnalysisASTConsumer &AnalysisConsumer, AnalyzerOptions &AnOpts) { AnOpts.CheckersAndPackages = {{"test.CustomChecker", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { - Registry.addChecker("test.CustomChecker", "Description", ""); + Registry.addChecker("test.CustomChecker", "MockDescription"); }); } @@ -73,8 +73,8 @@ void addLocIncDecChecker(AnalysisASTConsumer &AnalysisConsumer, AnalyzerOptions &AnOpts) { AnOpts.CheckersAndPackages = {{"test.LocIncDecChecker", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { - Registry.addChecker("test.LocIncDecChecker", "Description", - ""); + Registry.addChecker("test.LocIncDecChecker", + "MockDescription"); }); } @@ -119,10 +119,10 @@ bool shouldRegisterCheckerRegistrationOrderPrinter(const CheckerManager &mgr) { void addCheckerRegistrationOrderPrinter(CheckerRegistry &Registry) { Registry.addChecker(registerCheckerRegistrationOrderPrinter, shouldRegisterCheckerRegistrationOrderPrinter, - "test.RegistrationOrder", "Description", "", false); + "test.RegistrationOrder", "Description"); } -#define UNITTEST_CHECKER(CHECKER_NAME, DIAG_MSG) \ +#define UNITTEST_CHECKER(CHECKER_NAME) \ class CHECKER_NAME : public Checker> { \ public: \ void checkPreStmt(const DeclStmt *DS, CheckerContext &C) const {} \ @@ -137,11 +137,11 @@ void addCheckerRegistrationOrderPrinter(CheckerRegistry &Registry) { } \ void add##CHECKER_NAME(CheckerRegistry &Registry) { \ Registry.addChecker(register##CHECKER_NAME, shouldRegister##CHECKER_NAME, \ - "test." #CHECKER_NAME, "Description", "", false); \ + "test." #CHECKER_NAME, "Description"); \ } -UNITTEST_CHECKER(StrongDep, "Strong") -UNITTEST_CHECKER(Dep, "Dep") +UNITTEST_CHECKER(StrongDep) +UNITTEST_CHECKER(Dep) bool shouldRegisterStrongFALSE(const CheckerManager &mgr) { return false; @@ -154,7 +154,7 @@ void addDep(AnalysisASTConsumer &AnalysisConsumer, {"test.RegistrationOrder", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { Registry.addChecker(registerStrongDep, shouldRegisterStrongFALSE, - "test.Strong", "Description", "", false); + "test.Strong", "Description"); addStrongDep(Registry); addDep(Registry); addCheckerRegistrationOrderPrinter(Registry); @@ -172,7 +172,7 @@ TEST(RegisterDeps, UnsatisfiedDependency) { // Weak checker dependencies. 
//===----------------------------------------------------------------------===// -UNITTEST_CHECKER(WeakDep, "Weak") +UNITTEST_CHECKER(WeakDep) void addWeakDepCheckerBothEnabled(AnalysisASTConsumer &AnalysisConsumer, AnalyzerOptions &AnOpts) { @@ -225,8 +225,8 @@ void addWeakDepCheckerDepUnspecified(AnalysisASTConsumer &AnalysisConsumer, }); } -UNITTEST_CHECKER(WeakDep2, "Weak2") -UNITTEST_CHECKER(Dep2, "Dep2") +UNITTEST_CHECKER(WeakDep2) +UNITTEST_CHECKER(Dep2) void addWeakDepHasWeakDep(AnalysisASTConsumer &AnalysisConsumer, AnalyzerOptions &AnOpts) { diff --git a/clang/unittests/StaticAnalyzer/SValSimplifyerTest.cpp b/clang/unittests/StaticAnalyzer/SValSimplifyerTest.cpp index 85cfe2c1965ac..4331ffc1b585c 100644 --- a/clang/unittests/StaticAnalyzer/SValSimplifyerTest.cpp +++ b/clang/unittests/StaticAnalyzer/SValSimplifyerTest.cpp @@ -68,8 +68,7 @@ static void addSimplifyChecker(AnalysisASTConsumer &AnalysisConsumer, AnalyzerOptions &AnOpts) { AnOpts.CheckersAndPackages = {{"SimplifyChecker", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { - Registry.addChecker("SimplifyChecker", "EmptyDescription", - "EmptyDocsUri"); + Registry.addChecker("SimplifyChecker", "MockDescription"); }); } diff --git a/clang/unittests/StaticAnalyzer/SValTest.cpp b/clang/unittests/StaticAnalyzer/SValTest.cpp index d8897b0f2183d..58e9a8da0e99d 100644 --- a/clang/unittests/StaticAnalyzer/SValTest.cpp +++ b/clang/unittests/StaticAnalyzer/SValTest.cpp @@ -139,10 +139,10 @@ class SValTest : public testing::TestWithParam {}; \ void add##NAME##SValCollector(AnalysisASTConsumer &AnalysisConsumer, \ AnalyzerOptions &AnOpts) { \ - AnOpts.CheckersAndPackages = {{"test.##NAME##SValCollector", true}}; \ + AnOpts.CheckersAndPackages = {{"test." #NAME "SValColl", true}}; \ AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { \ - Registry.addChecker("test.##NAME##SValCollector", \ - "Description", ""); \ + Registry.addChecker("test." 
#NAME "SValColl", \ + "MockDescription"); \ }); \ } \ \ diff --git a/clang/unittests/StaticAnalyzer/TestReturnValueUnderConstruction.cpp b/clang/unittests/StaticAnalyzer/TestReturnValueUnderConstruction.cpp index 5fc084a48667c..0cb3c59a4421d 100644 --- a/clang/unittests/StaticAnalyzer/TestReturnValueUnderConstruction.cpp +++ b/clang/unittests/StaticAnalyzer/TestReturnValueUnderConstruction.cpp @@ -49,9 +49,9 @@ void addTestReturnValueUnderConstructionChecker( AnOpts.CheckersAndPackages = {{"test.TestReturnValueUnderConstruction", true}}; AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { - Registry.addChecker( - "test.TestReturnValueUnderConstruction", "", ""); - }); + Registry.addChecker( + "test.TestReturnValueUnderConstruction", "MockDescription"); + }); } TEST(TestReturnValueUnderConstructionChecker, diff --git a/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp b/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp index ddc663e2b6fd3..23a2df42ff08c 100644 --- a/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp +++ b/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp @@ -25,8 +25,8 @@ class NestedNameSpecifiersVisitor : public ExpectedLocationVisitor { bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) override { if (!NNS) return true; - if (const NamespaceDecl *ND = - NNS.getNestedNameSpecifier()->getAsNamespace()) + if (const auto *ND = dyn_cast_if_present( + NNS.getNestedNameSpecifier()->getAsNamespace())) Match(ND->getName(), NNS.getLocalBeginLoc()); return ExpectedLocationVisitor::TraverseNestedNameSpecifierLoc(NNS); } diff --git a/clang/unittests/Tooling/RefactoringTest.cpp b/clang/unittests/Tooling/RefactoringTest.cpp index 254d95bc20cb0..35d114343b517 100644 --- a/clang/unittests/Tooling/RefactoringTest.cpp +++ b/clang/unittests/Tooling/RefactoringTest.cpp @@ -748,7 +748,8 @@ class NestedNameSpecifierAVisitor : public TestVisitor { public: bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc NNSLoc) override { if (NNSLoc.getNestedNameSpecifier()) { - if (const NamespaceDecl* NS = NNSLoc.getNestedNameSpecifier()->getAsNamespace()) { + if (const auto *NS = dyn_cast_if_present( + NNSLoc.getNestedNameSpecifier()->getAsNamespace())) { if (NS->getName() == "a") { Replace = Replacement(*SM, &NNSLoc, "", Context->getLangOpts()); } diff --git a/compiler-rt/lib/asan/asan_allocator.cpp b/compiler-rt/lib/asan/asan_allocator.cpp index d3c0288285b86..9ebe4d08ec8c1 100644 --- a/compiler-rt/lib/asan/asan_allocator.cpp +++ b/compiler-rt/lib/asan/asan_allocator.cpp @@ -1007,13 +1007,8 @@ void PrintInternalAllocatorStats() { instance.PrintStats(); } -void asan_free(void *ptr, BufferedStackTrace *stack, AllocType alloc_type) { - instance.Deallocate(ptr, 0, 0, stack, alloc_type); -} - -void asan_delete(void *ptr, uptr size, uptr alignment, - BufferedStackTrace *stack, AllocType alloc_type) { - instance.Deallocate(ptr, size, alignment, stack, alloc_type); +void asan_free(void *ptr, BufferedStackTrace *stack) { + instance.Deallocate(ptr, 0, 0, stack, FROM_MALLOC); } void *asan_malloc(uptr size, BufferedStackTrace *stack) { @@ -1068,8 +1063,7 @@ void *asan_pvalloc(uptr size, BufferedStackTrace *stack) { instance.Allocate(size, PageSize, stack, FROM_MALLOC, true)); } -void *asan_memalign(uptr alignment, uptr size, BufferedStackTrace *stack, - AllocType alloc_type) { +void *asan_memalign(uptr alignment, uptr size, BufferedStackTrace *stack) { if 
(UNLIKELY(!IsPowerOfTwo(alignment))) { errno = errno_EINVAL; if (AllocatorMayReturnNull()) @@ -1077,7 +1071,7 @@ void *asan_memalign(uptr alignment, uptr size, BufferedStackTrace *stack, ReportInvalidAllocationAlignment(alignment, stack); } return SetErrnoOnNull( - instance.Allocate(size, alignment, stack, alloc_type, true)); + instance.Allocate(size, alignment, stack, FROM_MALLOC, true)); } void *asan_aligned_alloc(uptr alignment, uptr size, BufferedStackTrace *stack) { @@ -1117,6 +1111,99 @@ uptr asan_malloc_usable_size(const void *ptr, uptr pc, uptr bp) { return usable_size; } +namespace { + +void *asan_new(uptr size, BufferedStackTrace *stack, bool array) { + return SetErrnoOnNull( + instance.Allocate(size, 0, stack, array ? FROM_NEW_BR : FROM_NEW, true)); +} + +void *asan_new_aligned(uptr size, uptr alignment, BufferedStackTrace *stack, + bool array) { + if (UNLIKELY(alignment == 0 || !IsPowerOfTwo(alignment))) { + errno = errno_EINVAL; + if (AllocatorMayReturnNull()) + return nullptr; + ReportInvalidAllocationAlignment(alignment, stack); + } + return SetErrnoOnNull(instance.Allocate( + size, alignment, stack, array ? FROM_NEW_BR : FROM_NEW, true)); +} + +void asan_delete(void *ptr, BufferedStackTrace *stack, bool array) { + instance.Deallocate(ptr, 0, 0, stack, array ? FROM_NEW_BR : FROM_NEW); +} + +void asan_delete_aligned(void *ptr, uptr alignment, BufferedStackTrace *stack, + bool array) { + instance.Deallocate(ptr, 0, alignment, stack, array ? FROM_NEW_BR : FROM_NEW); +} + +void asan_delete_sized(void *ptr, uptr size, BufferedStackTrace *stack, + bool array) { + instance.Deallocate(ptr, size, 0, stack, array ? FROM_NEW_BR : FROM_NEW); +} + +void asan_delete_sized_aligned(void *ptr, uptr size, uptr alignment, + BufferedStackTrace *stack, bool array) { + instance.Deallocate(ptr, size, alignment, stack, + array ? 
FROM_NEW_BR : FROM_NEW); +} + +} // namespace + +void *asan_new(uptr size, BufferedStackTrace *stack) { + return asan_new(size, stack, /*array=*/false); +} + +void *asan_new_aligned(uptr size, uptr alignment, BufferedStackTrace *stack) { + return asan_new_aligned(size, alignment, stack, /*array=*/false); +} + +void *asan_new_array(uptr size, BufferedStackTrace *stack) { + return asan_new(size, stack, /*array=*/true); +} + +void *asan_new_array_aligned(uptr size, uptr alignment, + BufferedStackTrace *stack) { + return asan_new_aligned(size, alignment, stack, /*array=*/true); +} + +void asan_delete(void *ptr, BufferedStackTrace *stack) { + asan_delete(ptr, stack, /*array=*/false); +} + +void asan_delete_aligned(void *ptr, uptr alignment, BufferedStackTrace *stack) { + asan_delete_aligned(ptr, alignment, stack, /*array=*/false); +} + +void asan_delete_sized(void *ptr, uptr size, BufferedStackTrace *stack) { + asan_delete_sized(ptr, size, stack, /*array=*/false); +} + +void asan_delete_sized_aligned(void *ptr, uptr size, uptr alignment, + BufferedStackTrace *stack) { + asan_delete_sized_aligned(ptr, size, alignment, stack, /*array=*/false); +} + +void asan_delete_array(void *ptr, BufferedStackTrace *stack) { + asan_delete(ptr, stack, /*array=*/true); +} + +void asan_delete_array_aligned(void *ptr, uptr alignment, + BufferedStackTrace *stack) { + asan_delete_aligned(ptr, alignment, stack, /*array=*/true); +} + +void asan_delete_array_sized(void *ptr, uptr size, BufferedStackTrace *stack) { + asan_delete_sized(ptr, size, stack, /*array=*/true); +} + +void asan_delete_array_sized_aligned(void *ptr, uptr size, uptr alignment, + BufferedStackTrace *stack) { + asan_delete_sized_aligned(ptr, size, alignment, stack, /*array=*/true); +} + uptr asan_mz_size(const void *ptr) { return instance.AllocationSize(reinterpret_cast(ptr)); } diff --git a/compiler-rt/lib/asan/asan_allocator.h b/compiler-rt/lib/asan/asan_allocator.h index 247d8bb77c984..fdf456473fb02 100644 --- a/compiler-rt/lib/asan/asan_allocator.h +++ b/compiler-rt/lib/asan/asan_allocator.h @@ -270,11 +270,8 @@ struct AsanThreadLocalMallocStorage { AsanThreadLocalMallocStorage() {} }; -void *asan_memalign(uptr alignment, uptr size, BufferedStackTrace *stack, - AllocType alloc_type); -void asan_free(void *ptr, BufferedStackTrace *stack, AllocType alloc_type); -void asan_delete(void *ptr, uptr size, uptr alignment, - BufferedStackTrace *stack, AllocType alloc_type); +void *asan_memalign(uptr alignment, uptr size, BufferedStackTrace *stack); +void asan_free(void *ptr, BufferedStackTrace *stack); void *asan_malloc(uptr size, BufferedStackTrace *stack); void *asan_calloc(uptr nmemb, uptr size, BufferedStackTrace *stack); @@ -289,6 +286,23 @@ int asan_posix_memalign(void **memptr, uptr alignment, uptr size, BufferedStackTrace *stack); uptr asan_malloc_usable_size(const void *ptr, uptr pc, uptr bp); +void *asan_new(uptr size, BufferedStackTrace *stack); +void *asan_new_aligned(uptr size, uptr alignment, BufferedStackTrace *stack); +void *asan_new_array(uptr size, BufferedStackTrace *stack); +void *asan_new_array_aligned(uptr size, uptr alignment, + BufferedStackTrace *stack); +void asan_delete(void *ptr, BufferedStackTrace *stack); +void asan_delete_aligned(void *ptr, uptr alignment, BufferedStackTrace *stack); +void asan_delete_sized(void *ptr, uptr size, BufferedStackTrace *stack); +void asan_delete_sized_aligned(void *ptr, uptr size, uptr alignment, + BufferedStackTrace *stack); +void asan_delete_array(void *ptr, BufferedStackTrace *stack); +void 
asan_delete_array_aligned(void *ptr, uptr alignment, + BufferedStackTrace *stack); +void asan_delete_array_sized(void *ptr, uptr size, BufferedStackTrace *stack); +void asan_delete_array_sized_aligned(void *ptr, uptr size, uptr alignment, + BufferedStackTrace *stack); + uptr asan_mz_size(const void *ptr); void asan_mz_force_lock(); void asan_mz_force_unlock(); diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index 0c30959b23e28..18c2a6c571c1f 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -449,10 +449,12 @@ AddressDescription::AddressDescription(uptr addr, uptr access_size, // are put to the STACK region for unknown reasons. Check global first can // workaround this issue. // TODO: Look into whether there's a different solution to this problem. +#if SANITIZER_AIX if (GetGlobalAddressInformation(addr, access_size, &data.global)) { data.kind = kAddressKindGlobal; return; } +#endif if (GetHeapAddressInformation(addr, access_size, &data.heap)) { data.kind = kAddressKindHeap; @@ -471,6 +473,14 @@ AddressDescription::AddressDescription(uptr addr, uptr access_size, return; } +// GetGlobalAddressInformation is called earlier on AIX due to a workaround +#if !SANITIZER_AIX + if (GetGlobalAddressInformation(addr, access_size, &data.global)) { + data.kind = kAddressKindGlobal; + return; + } +#endif + data.kind = kAddressKindWild; data.wild.addr = addr; data.wild.access_size = access_size; diff --git a/compiler-rt/lib/asan/asan_mac.cpp b/compiler-rt/lib/asan/asan_mac.cpp index be513a03ed5cd..4bc2c88e7d88a 100644 --- a/compiler-rt/lib/asan/asan_mac.cpp +++ b/compiler-rt/lib/asan/asan_mac.cpp @@ -176,7 +176,7 @@ void asan_dispatch_call_block_and_release(void *block) { asan_register_worker_thread(context->parent_tid, &stack); // Call the original dispatcher for the block. 
context->func(context->block); - asan_free(context, &stack, FROM_MALLOC); + asan_free(context, &stack); } } // namespace __asan diff --git a/compiler-rt/lib/asan/asan_malloc_linux.cpp b/compiler-rt/lib/asan/asan_malloc_linux.cpp index 3f023d4c2ed0a..add57318785be 100644 --- a/compiler-rt/lib/asan/asan_malloc_linux.cpp +++ b/compiler-rt/lib/asan/asan_malloc_linux.cpp @@ -49,7 +49,7 @@ INTERCEPTOR(void, free, void *ptr) { if (DlsymAlloc::PointerIsMine(ptr)) return DlsymAlloc::Free(ptr); GET_STACK_TRACE_FREE; - asan_free(ptr, &stack, FROM_MALLOC); + asan_free(ptr, &stack); } #if SANITIZER_INTERCEPT_CFREE @@ -57,7 +57,7 @@ INTERCEPTOR(void, cfree, void *ptr) { if (DlsymAlloc::PointerIsMine(ptr)) return DlsymAlloc::Free(ptr); GET_STACK_TRACE_FREE; - asan_free(ptr, &stack, FROM_MALLOC); + asan_free(ptr, &stack); } #endif // SANITIZER_INTERCEPT_CFREE @@ -93,12 +93,12 @@ INTERCEPTOR(void*, reallocarray, void *ptr, uptr nmemb, uptr size) { #if SANITIZER_INTERCEPT_MEMALIGN INTERCEPTOR(void*, memalign, uptr boundary, uptr size) { GET_STACK_TRACE_MALLOC; - return asan_memalign(boundary, size, &stack, FROM_MALLOC); + return asan_memalign(boundary, size, &stack); } INTERCEPTOR(void*, __libc_memalign, uptr boundary, uptr size) { GET_STACK_TRACE_MALLOC; - return asan_memalign(boundary, size, &stack, FROM_MALLOC); + return asan_memalign(boundary, size, &stack); } #endif // SANITIZER_INTERCEPT_MEMALIGN diff --git a/compiler-rt/lib/asan/asan_malloc_mac.cpp b/compiler-rt/lib/asan/asan_malloc_mac.cpp index f25d7e1901536..a442bdbbaa4d3 100644 --- a/compiler-rt/lib/asan/asan_malloc_mac.cpp +++ b/compiler-rt/lib/asan/asan_malloc_mac.cpp @@ -31,7 +31,7 @@ using namespace __asan; # define COMMON_MALLOC_FORCE_UNLOCK() asan_mz_force_unlock() # define COMMON_MALLOC_MEMALIGN(alignment, size) \ GET_STACK_TRACE_MALLOC; \ - void *p = asan_memalign(alignment, size, &stack, FROM_MALLOC) + void *p = asan_memalign(alignment, size, &stack) # define COMMON_MALLOC_MALLOC(size) \ GET_STACK_TRACE_MALLOC; \ void *p = asan_malloc(size, &stack) @@ -46,10 +46,10 @@ using namespace __asan; int res = asan_posix_memalign(memptr, alignment, size, &stack); # define COMMON_MALLOC_VALLOC(size) \ GET_STACK_TRACE_MALLOC; \ - void *p = asan_memalign(GetPageSizeCached(), size, &stack, FROM_MALLOC); + void *p = asan_memalign(GetPageSizeCached(), size, &stack); # define COMMON_MALLOC_FREE(ptr) \ GET_STACK_TRACE_FREE; \ - asan_free(ptr, &stack, FROM_MALLOC); + asan_free(ptr, &stack); # define COMMON_MALLOC_SIZE(ptr) uptr size = asan_mz_size(ptr); # define COMMON_MALLOC_FILL_STATS(zone, stats) \ AsanMallocStats malloc_stats; \ diff --git a/compiler-rt/lib/asan/asan_malloc_win.cpp b/compiler-rt/lib/asan/asan_malloc_win.cpp index 3278f07219876..8d98da940800f 100644 --- a/compiler-rt/lib/asan/asan_malloc_win.cpp +++ b/compiler-rt/lib/asan/asan_malloc_win.cpp @@ -69,7 +69,7 @@ __declspec(noinline) size_t _msize_base(void *ptr) { return _msize(ptr); } __declspec(noinline) void free(void *ptr) { GET_STACK_TRACE_FREE; - return asan_free(ptr, &stack, FROM_MALLOC); + return asan_free(ptr, &stack); } __declspec(noinline) void _free_dbg(void *ptr, int) { free(ptr); } @@ -252,7 +252,7 @@ INTERCEPTOR_WINAPI(BOOL, HeapFree, HANDLE hHeap, DWORD dwFlags, LPVOID lpMem) { CHECK((HEAP_FREE_UNSUPPORTED_FLAGS & dwFlags) != 0 && "unsupported flags"); } GET_STACK_TRACE_FREE; - asan_free(lpMem, &stack, FROM_MALLOC); + asan_free(lpMem, &stack); return true; } @@ -306,7 +306,7 @@ void *SharedReAlloc(ReAllocFunction reallocFunc, SizeFunction heapSizeFunc, if 
(replacement_alloc) { size_t old_size = heapSizeFunc(hHeap, dwFlags, lpMem); if (old_size == ((size_t)0) - 1) { - asan_free(replacement_alloc, &stack, FROM_MALLOC); + asan_free(replacement_alloc, &stack); return nullptr; } REAL(memcpy)(replacement_alloc, lpMem, old_size); @@ -331,7 +331,7 @@ void *SharedReAlloc(ReAllocFunction reallocFunc, SizeFunction heapSizeFunc, old_usable_size = asan_malloc_usable_size(lpMem, pc, bp); REAL(memcpy)(replacement_alloc, lpMem, Min(dwBytes, old_usable_size)); - asan_free(lpMem, &stack, FROM_MALLOC); + asan_free(lpMem, &stack); } return replacement_alloc; } @@ -429,7 +429,7 @@ INTERCEPTOR_WINAPI(BOOL, RtlFreeHeap, HANDLE HeapHandle, DWORD Flags, return REAL(RtlFreeHeap)(HeapHandle, Flags, BaseAddress); } GET_STACK_TRACE_FREE; - asan_free(BaseAddress, &stack, FROM_MALLOC); + asan_free(BaseAddress, &stack); return true; } diff --git a/compiler-rt/lib/asan/asan_new_delete.cpp b/compiler-rt/lib/asan/asan_new_delete.cpp index 99c7c9938dfb3..d7ed5b570728b 100644 --- a/compiler-rt/lib/asan/asan_new_delete.cpp +++ b/compiler-rt/lib/asan/asan_new_delete.cpp @@ -60,42 +60,42 @@ enum class align_val_t: size_t {}; // TODO(alekseyshl): throw std::bad_alloc instead of dying on OOM. // For local pool allocation, align to SHADOW_GRANULARITY to match asan // allocator behavior. -#define OPERATOR_NEW_BODY \ - GET_STACK_TRACE_MALLOC; \ - void *res = asan_memalign(0, size, &stack, FROM_NEW); \ - if (UNLIKELY(!res)) \ - ReportOutOfMemory(size, &stack); \ +#define OPERATOR_NEW_BODY \ + GET_STACK_TRACE_MALLOC; \ + void *res = asan_new(size, &stack); \ + if (UNLIKELY(!res)) \ + ReportOutOfMemory(size, &stack); \ return res #define OPERATOR_NEW_BODY_NOTHROW \ GET_STACK_TRACE_MALLOC; \ - return asan_memalign(0, size, &stack, FROM_NEW) -#define OPERATOR_NEW_BODY_ARRAY \ - GET_STACK_TRACE_MALLOC; \ - void *res = asan_memalign(0, size, &stack, FROM_NEW_BR); \ - if (UNLIKELY(!res)) \ - ReportOutOfMemory(size, &stack); \ + return asan_new(size, &stack) +#define OPERATOR_NEW_BODY_ARRAY \ + GET_STACK_TRACE_MALLOC; \ + void *res = asan_new_array(size, &stack); \ + if (UNLIKELY(!res)) \ + ReportOutOfMemory(size, &stack); \ return res #define OPERATOR_NEW_BODY_ARRAY_NOTHROW \ GET_STACK_TRACE_MALLOC; \ - return asan_memalign(0, size, &stack, FROM_NEW_BR) -#define OPERATOR_NEW_BODY_ALIGN \ - GET_STACK_TRACE_MALLOC; \ - void *res = asan_memalign((uptr)align, size, &stack, FROM_NEW); \ - if (UNLIKELY(!res)) \ - ReportOutOfMemory(size, &stack); \ + return asan_new_array(size, &stack) +#define OPERATOR_NEW_BODY_ALIGN \ + GET_STACK_TRACE_MALLOC; \ + void *res = asan_new_aligned(size, static_cast(align), &stack); \ + if (UNLIKELY(!res)) \ + ReportOutOfMemory(size, &stack); \ return res #define OPERATOR_NEW_BODY_ALIGN_NOTHROW \ GET_STACK_TRACE_MALLOC; \ - return asan_memalign((uptr)align, size, &stack, FROM_NEW) -#define OPERATOR_NEW_BODY_ALIGN_ARRAY \ - GET_STACK_TRACE_MALLOC; \ - void *res = asan_memalign((uptr)align, size, &stack, FROM_NEW_BR); \ - if (UNLIKELY(!res)) \ - ReportOutOfMemory(size, &stack); \ + return asan_new_aligned(size, static_cast(align), &stack) +#define OPERATOR_NEW_BODY_ALIGN_ARRAY \ + GET_STACK_TRACE_MALLOC; \ + void *res = asan_new_array_aligned(size, static_cast(align), &stack); \ + if (UNLIKELY(!res)) \ + ReportOutOfMemory(size, &stack); \ return res #define OPERATOR_NEW_BODY_ALIGN_ARRAY_NOTHROW \ GET_STACK_TRACE_MALLOC; \ - return asan_memalign((uptr)align, size, &stack, FROM_NEW_BR) + return asan_new_array_aligned(size, static_cast(align), &stack) // On OS X it's not 
enough to just provide our own 'operator new' and // 'operator delete' implementations, because they're going to be in the @@ -149,28 +149,28 @@ INTERCEPTOR(void *, _ZnamRKSt9nothrow_t, size_t size, std::nothrow_t const&) { #define OPERATOR_DELETE_BODY \ GET_STACK_TRACE_FREE; \ - asan_delete(ptr, 0, 0, &stack, FROM_NEW) + asan_delete(ptr, &stack) #define OPERATOR_DELETE_BODY_ARRAY \ GET_STACK_TRACE_FREE; \ - asan_delete(ptr, 0, 0, &stack, FROM_NEW_BR) + asan_delete_array(ptr, &stack) #define OPERATOR_DELETE_BODY_ALIGN \ GET_STACK_TRACE_FREE; \ - asan_delete(ptr, 0, static_cast(align), &stack, FROM_NEW) + asan_delete_aligned(ptr, static_cast(align), &stack) #define OPERATOR_DELETE_BODY_ALIGN_ARRAY \ GET_STACK_TRACE_FREE; \ - asan_delete(ptr, 0, static_cast(align), &stack, FROM_NEW_BR) + asan_delete_array_aligned(ptr, static_cast(align), &stack) #define OPERATOR_DELETE_BODY_SIZE \ GET_STACK_TRACE_FREE; \ - asan_delete(ptr, size, 0, &stack, FROM_NEW) + asan_delete_sized(ptr, size, &stack) #define OPERATOR_DELETE_BODY_SIZE_ARRAY \ GET_STACK_TRACE_FREE; \ - asan_delete(ptr, size, 0, &stack, FROM_NEW_BR) + asan_delete_array_sized(ptr, size, &stack) #define OPERATOR_DELETE_BODY_SIZE_ALIGN \ GET_STACK_TRACE_FREE; \ - asan_delete(ptr, size, static_cast(align), &stack, FROM_NEW) + asan_delete_sized_aligned(ptr, size, static_cast(align), &stack) #define OPERATOR_DELETE_BODY_SIZE_ALIGN_ARRAY \ GET_STACK_TRACE_FREE; \ - asan_delete(ptr, size, static_cast(align), &stack, FROM_NEW_BR) + asan_delete_array_sized_aligned(ptr, size, static_cast(align), &stack) #if !SANITIZER_APPLE CXX_OPERATOR_ATTRIBUTE diff --git a/compiler-rt/lib/asan/tests/asan_noinst_test.cpp b/compiler-rt/lib/asan/tests/asan_noinst_test.cpp index f485404758b54..401219ac3628c 100644 --- a/compiler-rt/lib/asan/tests/asan_noinst_test.cpp +++ b/compiler-rt/lib/asan/tests/asan_noinst_test.cpp @@ -71,7 +71,7 @@ static void *MallocStress(void *NumOfItrPtr) { void *ptr = vec[idx]; vec[idx] = vec.back(); vec.pop_back(); - __asan::asan_free(ptr, &stack1, __asan::FROM_MALLOC); + __asan::asan_free(ptr, &stack1); } else { size_t size = my_rand_r(&seed) % 1000 + 1; switch ((my_rand_r(&seed) % 128)) { @@ -80,8 +80,7 @@ static void *MallocStress(void *NumOfItrPtr) { case 2: size += 4096; break; } size_t alignment = 1 << (my_rand_r(&seed) % 10 + 1); - char *ptr = (char *)__asan::asan_memalign(alignment, size, &stack2, - __asan::FROM_MALLOC); + char *ptr = (char *)__asan::asan_memalign(alignment, size, &stack2); EXPECT_EQ(size, __asan::asan_malloc_usable_size(ptr, 0, 0)); vec.push_back(ptr); ptr[0] = 0; @@ -89,8 +88,7 @@ static void *MallocStress(void *NumOfItrPtr) { ptr[size/2] = 0; } } - for (size_t i = 0; i < vec.size(); i++) - __asan::asan_free(vec[i], &stack3, __asan::FROM_MALLOC); + for (size_t i = 0; i < vec.size(); i++) __asan::asan_free(vec[i], &stack3); return nullptr; } @@ -143,12 +141,12 @@ TEST(AddressSanitizer, QuarantineTest) { const int size = 1024; void *p = __asan::asan_malloc(size, &stack); - __asan::asan_free(p, &stack, __asan::FROM_MALLOC); + __asan::asan_free(p, &stack); size_t i; size_t max_i = 1 << 30; for (i = 0; i < max_i; i++) { void *p1 = __asan::asan_malloc(size, &stack); - __asan::asan_free(p1, &stack, __asan::FROM_MALLOC); + __asan::asan_free(p1, &stack); if (p1 == p) break; } EXPECT_GE(i, 10000U); @@ -165,7 +163,7 @@ void *ThreadedQuarantineTestWorker(void *unused) { for (size_t i = 0; i < 1000; i++) { void *p = __asan::asan_malloc(1 + (my_rand_r(&seed) % 4000), &stack); - __asan::asan_free(p, &stack, __asan::FROM_MALLOC); 
+ __asan::asan_free(p, &stack); } return NULL; } @@ -204,7 +202,7 @@ void *ThreadedOneSizeMallocStress(void *unused) { p[i] = __asan::asan_malloc(32, &stack); } for (size_t i = 0; i < kNumMallocs; i++) { - __asan::asan_free(p[i], &stack, __asan::FROM_MALLOC); + __asan::asan_free(p[i], &stack); } } return NULL; @@ -260,7 +258,7 @@ static void TestLoadStoreCallbacks(CB cb[2][5]) { } } } - __asan::asan_free(ptr, &stack, __asan::FROM_MALLOC); + __asan::asan_free(ptr, &stack); } __asan_test_only_reported_buggy_pointer = 0; } diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 3ab92403d4168..eb2e7664152b8 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -591,10 +591,16 @@ set(aarch64_SOURCES ${GENERIC_TF_SOURCES} ${GENERIC_SOURCES} cpu_model/aarch64.c - aarch64/emupac.cpp aarch64/fp_mode.c ) +# Append sources specific to AArch64 targets that aren't supported by MSVC. +if(NOT MSVC) + list(APPEND aarch64_SOURCES + aarch64/emupac.cpp + ) +endif() + set(COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR NOT(FUCHSIA OR APPLE)) if (COMPILER_RT_HAS_AARCH64_SME) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h index a2b6c37d5450c..00e237a24fc91 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h @@ -102,6 +102,8 @@ const unsigned struct_kernel_stat_sz = SANITIZER_ANDROID ? FIRST_32_SECOND_64(104, 128) # if defined(_ABIN32) && _MIPS_SIM == _ABIN32 : FIRST_32_SECOND_64(176, 216); +# elif SANITIZER_MUSL + : FIRST_32_SECOND_64(160, 208); # else : FIRST_32_SECOND_64(160, 216); # endif @@ -476,6 +478,30 @@ struct __sanitizer_cmsghdr { int cmsg_level; int cmsg_type; }; +# elif SANITIZER_MUSL +struct __sanitizer_msghdr { + void *msg_name; + unsigned msg_namelen; + struct __sanitizer_iovec *msg_iov; + int msg_iovlen; +# if SANITIZER_WORDSIZE == 64 + int __pad1; +# endif + void *msg_control; + unsigned msg_controllen; +# if SANITIZER_WORDSIZE == 64 + int __pad2; +# endif + int msg_flags; +}; +struct __sanitizer_cmsghdr { + unsigned cmsg_len; +# if SANITIZER_WORDSIZE == 64 + int __pad1; +# endif + int cmsg_level; + int cmsg_type; +}; # else // In POSIX, int msg_iovlen; socklen_t msg_controllen; socklen_t cmsg_len; but // many implementations don't conform to the standard. diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh index 4d435976d3a10..567402ee85e96 100755 --- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh +++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh @@ -87,7 +87,7 @@ AR="${AR}" CC="${CC}" CFLAGS="$FLAGS -Wno-deprecated-non-prototype" RANLIB=/bin/ make -j libz.a # Build and install libcxxabi and libcxx. -if [[ ! -f ${LLVM_BUILD}/build.ninja ]]; then +if [[ ! 
-f ${LIBCXX_BUILD}/build.ninja ]]; then rm -rf "${LIBCXX_BUILD}" "${LIBCXX_INSTALL}" mkdir -p ${LIBCXX_BUILD} ${LIBCXX_INSTALL} cd ${LIBCXX_BUILD} @@ -120,7 +120,7 @@ ninja cxx cxxabi && ninja install-cxx install-cxxabi FLAGS="${FLAGS} -fno-rtti -fno-exceptions" LLVM_CFLAGS="${FLAGS} -Wno-global-constructors" -LLVM_CXXFLAGS="${LLVM_CFLAGS} -nostdinc++ -I${ZLIB_BUILD} -isystem ${LIBCXX_INSTALL}/include -isystem ${LIBCXX_INSTALL}/include/c++/v1" +LLVM_CXXFLAGS="-isystem ${LIBCXX_INSTALL}/include -isystem ${LIBCXX_INSTALL}/include/c++/v1 ${LLVM_CFLAGS} -nostdinc++ -I${ZLIB_BUILD}" # Build LLVM. if [[ ! -f ${LLVM_BUILD}/build.ninja ]]; then diff --git a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp index e6ddbb00b843c..71f810e9d9724 100644 --- a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp @@ -11,44 +11,60 @@ #include "common.h" #include "mem_map.h" + +#include +#include +#include + #include -#include +#include namespace scudo { -static uptr getResidentMemorySize() { - if (!SCUDO_LINUX) - UNREACHABLE("Not implemented!"); - uptr Size; - uptr Resident; - std::ifstream IFS("/proc/self/statm"); - IFS >> Size; - IFS >> Resident; - return Resident * getPageSizeCached(); +static void getResidentPages(void *BaseAddress, size_t TotalPages, + size_t *ResidentPages) { + std::vector Pages(TotalPages, 0); + ASSERT_EQ( + 0, mincore(BaseAddress, TotalPages * getPageSizeCached(), Pages.data())) + << strerror(errno); + *ResidentPages = 0; + for (unsigned char Value : Pages) { + if (Value & 1) { + ++*ResidentPages; + } + } } -// Fuchsia needs getResidentMemorySize implementation. +// Fuchsia needs getResidentPages implementation. TEST(ScudoCommonTest, SKIP_ON_FUCHSIA(ResidentMemorySize)) { - uptr OnStart = getResidentMemorySize(); - EXPECT_GT(OnStart, 0UL); - - const uptr Size = 1ull << 30; - const uptr Threshold = Size >> 3; + // Make sure to have the size of the map on a page boundary. + const uptr PageSize = getPageSizeCached(); + const size_t NumPages = 1000; + const uptr SizeBytes = NumPages * PageSize; MemMapT MemMap; - ASSERT_TRUE(MemMap.map(/*Addr=*/0U, Size, "ResidentMemorySize")); + ASSERT_TRUE(MemMap.map(/*Addr=*/0U, SizeBytes, "ResidentMemorySize")); ASSERT_NE(MemMap.getBase(), 0U); - void *P = reinterpret_cast(MemMap.getBase()); - EXPECT_LT(getResidentMemorySize(), OnStart + Threshold); - - memset(P, 1, Size); - EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold); - - MemMap.releasePagesToOS(MemMap.getBase(), Size); - EXPECT_LT(getResidentMemorySize(), OnStart + Threshold); - memset(P, 1, Size); - EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold); + void *P = reinterpret_cast(MemMap.getBase()); + size_t ResidentPages; + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(0U, ResidentPages); + + // Make the entire map resident. + memset(P, 1, SizeBytes); + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(NumPages, ResidentPages); + + // Should release the memory to the kernel immediately. + MemMap.releasePagesToOS(MemMap.getBase(), SizeBytes); + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(0U, ResidentPages); + + // Make the entire map resident again. 
+ memset(P, 1, SizeBytes); + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(NumPages, ResidentPages); MemMap.unmap(); } diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index 05065444a70c5..612317b3c3293 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -183,7 +183,8 @@ TEST_F(ScudoWrappersCDeathTest, Malloc) { // process doing free(P) is not a double free. EXPECT_DEATH( { - void *Ptr = malloc(Size); + // Note: volatile here prevents the calls from being optimized out. + void *volatile Ptr = malloc(Size); free(Ptr); free(Ptr); }, diff --git a/compiler-rt/lib/tysan/lit.cfg b/compiler-rt/lib/tysan/lit.cfg index e3ef6c9c97147..c906c03cc3fb2 100644 --- a/compiler-rt/lib/tysan/lit.cfg +++ b/compiler-rt/lib/tysan/lit.cfg @@ -27,7 +27,7 @@ config.substitutions.append( ("%clangxx_tysan ", build_invocation(clang_tysan_cx config.suffixes = ['.c', '.cc', '.cpp'] # TypeSanitizer tests are currently supported on Linux only. -if config.host_os not in ['Linux']: +if config.target_os not in ['Linux']: config.unsupported = True if config.target_arch != 'aarch64': diff --git a/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py index 57c0979e60962..b622e072bcbfb 100644 --- a/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Windows"]: +if root.target_os not in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py index 3da073332c458..96201e679b0a3 100644 --- a/compiler-rt/test/asan/lit.cfg.py +++ b/compiler-rt/test/asan/lit.cfg.py @@ -28,7 +28,7 @@ def get_required_attr(config, attr_name): # tests to prevent regressions. # Currently, detect_leaks for asan tests only work on Intel MacOS. 
if ( - config.host_os == "Darwin" + config.target_os == "Darwin" and config.apple_platform == "osx" and config.target_arch == "x86_64" ): @@ -45,7 +45,7 @@ def get_required_attr(config, attr_name): # Setup source root. config.test_source_root = os.path.dirname(__file__) -if config.host_os not in ["FreeBSD", "NetBSD"]: +if config.target_os not in ["FreeBSD", "NetBSD"]: libdl_flag = "-ldl" else: libdl_flag = "" @@ -125,17 +125,17 @@ def build_invocation(compile_flags, with_lto=False): ("%clangxx_asan_lto ", build_invocation(clang_asan_cxxflags, True)) ) if config.asan_dynamic: - if config.host_os in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: + if config.target_os in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: shared_libasan_path = os.path.join( config.compiler_rt_libdir, "libclang_rt.asan{}.so".format(config.target_suffix), ) - elif config.host_os == "Darwin": + elif config.target_os == "Darwin": shared_libasan_path = os.path.join( config.compiler_rt_libdir, "libclang_rt.asan_{}_dynamic.dylib".format(config.apple_platform), ) - elif config.host_os == "Windows": + elif config.target_os == "Windows": shared_libasan_path = os.path.join( config.compiler_rt_libdir, "clang_rt.asan_dynamic-{}.lib".format(config.target_suffix), @@ -274,16 +274,16 @@ def build_invocation(compile_flags, with_lto=False): and (config.target_arch in ["x86_64", "i386", "i686", "aarch64"]) ) leak_detection_linux = ( - (config.host_os == "Linux") + (config.target_os == "Linux") and (not config.android) and (config.target_arch in ["x86_64", "i386", "riscv64", "loongarch64"]) ) leak_detection_mac = ( - (config.host_os == "Darwin") + (config.target_os == "Darwin") and (config.apple_platform == "osx") and (config.target_arch == "x86_64") ) -leak_detection_netbsd = (config.host_os == "NetBSD") and ( +leak_detection_netbsd = (config.target_os == "NetBSD") and ( config.target_arch in ["x86_64", "i386"] ) if ( @@ -296,7 +296,7 @@ def build_invocation(compile_flags, with_lto=False): # Add the RT libdir to PATH directly so that we can successfully run the gtest # binary to list its tests. -if config.host_os == "Windows": +if config.target_os == "Windows": os.environ["PATH"] = os.path.pathsep.join( [config.compiler_rt_libdir, os.environ.get("PATH", "")] ) @@ -310,10 +310,10 @@ def build_invocation(compile_flags, with_lto=False): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.suffixes.append(".mm") -if config.host_os == "Windows": +if config.target_os == "Windows": config.substitutions.append(("%fPIC", "")) config.substitutions.append(("%fPIE", "")) config.substitutions.append(("%pie", "")) @@ -323,11 +323,11 @@ def build_invocation(compile_flags, with_lto=False): config.substitutions.append(("%pie", "-pie")) # Only run the tests on supported OSs. 
-if config.host_os not in ["Linux", "Darwin", "FreeBSD", "SunOS", "Windows", "NetBSD"]: +if config.target_os not in ["Linux", "Darwin", "FreeBSD", "SunOS", "Windows", "NetBSD"]: config.unsupported = True if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/asan_abi/lit.cfg.py b/compiler-rt/test/asan_abi/lit.cfg.py index 5bc1881ed9c32..dd99a5373e7b6 100644 --- a/compiler-rt/test/asan_abi/lit.cfg.py +++ b/compiler-rt/test/asan_abi/lit.cfg.py @@ -68,7 +68,7 @@ def build_invocation(compile_flags): config.suffixes = ['.c', '.cpp'] -if config.host_os == 'Darwin': +if config.target_os == 'Darwin': config.suffixes.append('.mm') else: config.unsupported = True diff --git a/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/builtins/Unit/lit.cfg.py b/compiler-rt/test/builtins/Unit/lit.cfg.py index c030f89c66e42..59da054848f3c 100644 --- a/compiler-rt/test/builtins/Unit/lit.cfg.py +++ b/compiler-rt/test/builtins/Unit/lit.cfg.py @@ -80,10 +80,10 @@ def get_libgcc_file_name(): config.compiler_rt_libdir, "clang_rt.builtins%s.lib " % config.target_suffix ) config.substitutions.append(("%librt ", base_lib)) -elif config.host_os == "Darwin": +elif config.target_os == "Darwin": base_lib = os.path.join(config.compiler_rt_libdir, "libclang_rt.osx.a ") config.substitutions.append(("%librt ", base_lib + " -lSystem ")) -elif config.host_os == "Windows": +elif config.target_os == "Windows": base_lib = os.path.join( config.compiler_rt_libdir, "libclang_rt.builtins%s.a" % config.target_suffix ) @@ -104,7 +104,7 @@ def get_libgcc_file_name(): if sys.platform in ["win32"] and execute_external: # Don't pass dosish path separator to msys bash.exe. base_lib = base_lib.replace("\\", "/") - if config.host_os == "Haiku": + if config.target_os == "Haiku": config.substitutions.append(("%librt ", base_lib + " -lroot ")) else: config.substitutions.append(("%librt ", base_lib + " -lc -lm ")) diff --git a/compiler-rt/test/builtins/lit.cfg.py b/compiler-rt/test/builtins/lit.cfg.py index 9300488c8428d..6491f4735b9e6 100644 --- a/compiler-rt/test/builtins/lit.cfg.py +++ b/compiler-rt/test/builtins/lit.cfg.py @@ -21,7 +21,7 @@ ("%clang ", " " + config.clang + " " + " ".join(extra_flags) + " ") ) -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.substitutions.append( ("%macos_version_major", str(config.darwin_osx_version[0])) ) diff --git a/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py b/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py index dceb7cde7218b..2778d8c995fd1 100644 --- a/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py +++ b/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py @@ -6,7 +6,7 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux", "FreeBSD", "NetBSD"]: +if root.target_os not in ["Linux", "FreeBSD", "NetBSD"]: config.unsupported = True # Android O (API level 26) has support for cross-dso cfi in libdl.so. 
diff --git a/compiler-rt/test/ctx_profile/lit.cfg.py b/compiler-rt/test/ctx_profile/lit.cfg.py index 74d9bfd11ae28..75367d95a47bd 100644 --- a/compiler-rt/test/ctx_profile/lit.cfg.py +++ b/compiler-rt/test/ctx_profile/lit.cfg.py @@ -7,7 +7,7 @@ import lit.formats # Only run the tests on supported OSs. -if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/dfsan/lit.cfg.py b/compiler-rt/test/dfsan/lit.cfg.py index e947c51f99a5b..b26ff3e367942 100644 --- a/compiler-rt/test/dfsan/lit.cfg.py +++ b/compiler-rt/test/dfsan/lit.cfg.py @@ -25,5 +25,5 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp"] # DataFlowSanitizer tests are currently supported on Linux only. -if not (config.host_os in ["Linux"] and config.target_arch in ["aarch64", "x86_64", "loongarch64"]): +if not (config.target_os in ["Linux"] and config.target_arch in ["aarch64", "x86_64", "loongarch64"]): config.unsupported = True diff --git a/compiler-rt/test/fuzzer/lit.cfg.py b/compiler-rt/test/fuzzer/lit.cfg.py index 75d4cf2e4c529..1689f53d0b021 100644 --- a/compiler-rt/test/fuzzer/lit.cfg.py +++ b/compiler-rt/test/fuzzer/lit.cfg.py @@ -149,5 +149,5 @@ def generate_compiler_cmd(is_cpp=True, fuzzer_enabled=True, msan_enabled=False): if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/gwp_asan/lit.cfg.py b/compiler-rt/test/gwp_asan/lit.cfg.py index 7f68682162e3f..1592cf400023e 100644 --- a/compiler-rt/test/gwp_asan/lit.cfg.py +++ b/compiler-rt/test/gwp_asan/lit.cfg.py @@ -67,5 +67,5 @@ def build_invocation(compile_flags): ) # GWP-ASan tests are currently supported on Linux only. -if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/hwasan/lit.cfg.py b/compiler-rt/test/hwasan/lit.cfg.py index bbf23e683240a..3a1c8e1466aea 100644 --- a/compiler-rt/test/hwasan/lit.cfg.py +++ b/compiler-rt/test/hwasan/lit.cfg.py @@ -86,5 +86,5 @@ def build_invocation(compile_flags): # Default test suffixes. 
config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Linux", "Android"] or not config.has_lld: +if config.target_os not in ["Linux", "Android"] or not config.has_lld: config.unsupported = True diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index f5576ce0e013d..8328b407dcc36 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -66,7 +66,7 @@ def find_compiler_libdir(): # Fall back for older AppleClang that doesn't support `-print-runtime-dir` # Note `-print-file-name=` was broken for Apple # platforms so we can't use that approach here (see https://reviews.llvm.org/D101682). - if config.host_os == "Darwin": + if config.target_os == "Darwin": lib_dir, _ = get_path_from_clang(["-print-file-name=lib"], allow_failure=False) runtime_dir = os.path.join(lib_dir, "darwin") if not os.path.exists(runtime_dir): @@ -312,7 +312,7 @@ def push_dynamic_library_lookup_path(config, new_path): if platform.system() == "Windows" and target_is_msvc: config.environment["LIB"] = os.environ["LIB"] -config.available_features.add(config.host_os.lower()) +config.available_features.add(config.target_os.lower()) if config.target_triple.startswith("ppc") or config.target_triple.startswith("powerpc"): config.available_features.add("ppc") @@ -344,7 +344,7 @@ def push_dynamic_library_lookup_path(config, new_path): ) ) -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": nb_commands_dir = os.path.join( config.compiler_rt_src_root, "test", "sanitizer_common", "netbsd_commands" ) @@ -395,7 +395,7 @@ def get_ios_commands_dir(): if sanitizer not in config.environment: config.environment[sanitizer] = symbolizer_path -env_utility = "/opt/freeware/bin/env" if config.host_os == "AIX" else "env" +env_utility = "/opt/freeware/bin/env" if config.target_os == "AIX" else "env" env_unset_command = " ".join(f"-u {var}" for var in tool_symbolizer_path_list) config.substitutions.append( ("%env_unset_tool_symbolizer_path", f"{env_utility} {env_unset_command}") @@ -410,7 +410,7 @@ def get_ios_commands_dir(): lit_config.warning("%device_rm is not implemented") config.substitutions.append(("%device_rm", "echo ")) config.compile_wrapper = "" -elif config.host_os == "Darwin" and config.apple_platform != "osx": +elif config.target_os == "Darwin" and config.apple_platform != "osx": # Darwin tests can be targetting macOS, a device or a simulator. All devices # are declared as "ios", even for iOS derivatives (tvOS, watchOS). Similarly, # all simulators are "iossim". See the table below. @@ -498,7 +498,7 @@ def get_ios_commands_dir(): config.compile_wrapper = "" # Define CHECK-%os to check for OS-dependent output. -config.substitutions.append(("CHECK-%os", ("CHECK-" + config.host_os))) +config.substitutions.append(("CHECK-%os", ("CHECK-" + config.target_os))) # Define %arch to check for architecture-dependent output. 
config.substitutions.append(("%arch", (config.host_arch))) @@ -519,7 +519,7 @@ def get_ios_commands_dir(): config.available_features.add(target_arch + "-target-arch") if target_arch in ["x86_64", "i386"]: config.available_features.add("x86-target-arch") - config.available_features.add(target_arch + "-" + config.host_os.lower()) + config.available_features.add(target_arch + "-" + config.target_os.lower()) compiler_rt_debug = getattr(config, "compiler_rt_debug", False) if not compiler_rt_debug: @@ -565,7 +565,7 @@ def get_ios_commands_dir(): ("%darwin_min_target_with_tls_support", "%min_macos_deployment_target=10.12") ) -if config.host_os == "Darwin": +if config.target_os == "Darwin": osx_version = (10, 0, 0) try: osx_version = subprocess.check_output( @@ -602,12 +602,12 @@ def get_ios_commands_dir(): def get_macos_aligned_version(macos_vers): platform = config.apple_platform - if platform == "osx": + macos_major, macos_minor = macos_vers + + if platform == "osx" or macos_major >= 26: return macos_vers - macos_major, macos_minor = macos_vers assert macos_major >= 10 - if macos_major == 10: # macOS 10.x major = macos_minor minor = 0 @@ -708,7 +708,7 @@ def get_macos_aligned_version(macos_vers): config.substitutions.append(("%push_to_device", "echo ")) config.substitutions.append(("%adb_shell", "echo ")) -if config.host_os == "Linux": +if config.target_os == "Linux": def add_glibc_versions(ver_string): if config.android: return @@ -806,10 +806,10 @@ def is_windows_lto_supported(): return os.path.exists(os.path.join(config.llvm_tools_dir, "lld-link.exe")) -if config.host_os == "Darwin" and is_darwin_lto_supported(): +if config.target_os == "Darwin" and is_darwin_lto_supported(): config.lto_supported = True config.lto_flags = ["-Wl,-lto_library," + liblto_path()] -elif config.host_os in ["Linux", "FreeBSD", "NetBSD"]: +elif config.target_os in ["Linux", "FreeBSD", "NetBSD"]: config.lto_supported = False if config.use_lld and is_lld_lto_supported(): config.lto_supported = True @@ -822,7 +822,7 @@ def is_windows_lto_supported(): config.lto_flags = ["-fuse-ld=lld"] else: config.lto_flags = ["-fuse-ld=gold"] -elif config.host_os == "Windows" and is_windows_lto_supported(): +elif config.target_os == "Windows" and is_windows_lto_supported(): config.lto_supported = True config.lto_flags = ["-fuse-ld=lld"] else: @@ -871,7 +871,7 @@ def is_windows_lto_supported(): # Note that substitutions with numbers have to be defined first to avoid # being subsumed by substitutions with smaller postfix. 
for postfix in ["2", "1", ""]: - if config.host_os == "Darwin": + if config.target_os == "Darwin": config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -884,7 +884,7 @@ def is_windows_lto_supported(): "-install_name @rpath/`basename %dynamiclib{}`".format(postfix), ) ) - elif config.host_os in ("FreeBSD", "NetBSD", "OpenBSD"): + elif config.target_os in ("FreeBSD", "NetBSD", "OpenBSD"): config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -893,7 +893,7 @@ def is_windows_lto_supported(): ) ) config.substitutions.append(("%ld_flags_rpath_so" + postfix, "")) - elif config.host_os == "Linux": + elif config.target_os == "Linux": config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -901,7 +901,7 @@ def is_windows_lto_supported(): ) ) config.substitutions.append(("%ld_flags_rpath_so" + postfix, "")) - elif config.host_os == "SunOS": + elif config.target_os == "SunOS": config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -923,7 +923,7 @@ def is_windows_lto_supported(): config.substitutions.append(("%xdynamiclib_namespec", "%basename_t.dynamic")) config.default_sanitizer_opts = [] -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. config.default_sanitizer_opts += ["abort_on_error=0"] @@ -983,7 +983,7 @@ def is_windows_lto_supported(): elif config.use_lld and (not config.has_lld): config.unsupported = True -if config.host_os == "Darwin": +if config.target_os == "Darwin": if getattr(config, "darwin_linker_version", None): extra_cflags += ["-mlinker-version=" + config.darwin_linker_version] @@ -998,7 +998,7 @@ def is_windows_lto_supported(): ) config.target_cflags = " " + " ".join(target_cflags + extra_cflags) + " " -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.substitutions.append( ( "%get_pid_from_output", diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index 04d1a4df5a54f..2b4b72bc895c5 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -10,7 +10,7 @@ set_default("target_triple", "@COMPILER_RT_DEFAULT_TARGET_TRIPLE@") set_default("target_cflags", "@COMPILER_RT_TEST_COMPILER_CFLAGS@") set_default("host_arch", "@HOST_ARCH@") set_default("target_arch", "@COMPILER_RT_DEFAULT_TARGET_ARCH@") -set_default("host_os", "@HOST_OS@") +set_default("target_os", "@HOST_OS@") set_default("llvm_build_mode", "@LLVM_BUILD_MODE@") set_default("llvm_src_root", "@LLVM_MAIN_SRC_DIR@") set_default("llvm_obj_root", "@LLVM_BINARY_DIR@") diff --git a/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if 
root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/lsan/lit.common.cfg.py b/compiler-rt/test/lsan/lit.common.cfg.py index 9426b7d108bbf..1e2679438b114 100644 --- a/compiler-rt/test/lsan/lit.common.cfg.py +++ b/compiler-rt/test/lsan/lit.common.cfg.py @@ -34,7 +34,7 @@ def get_required_attr(config, attr_name): config.name = "LeakSanitizer-AddressSanitizer" lsan_cflags = ["-fsanitize=address"] config.available_features.add("asan") - if config.host_os == "NetBSD": + if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) elif lsan_lit_test_mode == "HWAddressSanitizer": config.name = "LeakSanitizer-HWAddressSanitizer" @@ -42,7 +42,7 @@ def get_required_attr(config, attr_name): if target_arch == "x86_64": lsan_cflags = lsan_cflags + ["-fsanitize-hwaddress-experimental-aliasing"] config.available_features.add("hwasan") - if config.host_os == "NetBSD": + if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) else: lit_config.fatal("Unknown LSan test mode: %r" % lsan_lit_test_mode) @@ -51,7 +51,7 @@ def get_required_attr(config, attr_name): # Platform-specific default LSAN_OPTIONS for lit tests. default_common_opts_str = ":".join(list(config.default_sanitizer_opts)) default_lsan_opts = default_common_opts_str + ":detect_leaks=1" -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. # Also, make sure we do not overwhelm the syslog while testing. @@ -101,7 +101,7 @@ def build_invocation(compile_flags): ) supported_linux = ( (not config.android) - and config.host_os == "Linux" + and config.target_os == "Linux" and config.host_arch in [ "aarch64", @@ -117,8 +117,8 @@ def build_invocation(compile_flags): "loongarch64", ] ) -supported_darwin = config.host_os == "Darwin" and config.target_arch in ["x86_64"] -supported_netbsd = config.host_os == "NetBSD" and config.target_arch in [ +supported_darwin = config.target_os == "Darwin" and config.target_arch in ["x86_64"] +supported_netbsd = config.target_os == "NetBSD" and config.target_arch in [ "x86_64", "i386", ] diff --git a/compiler-rt/test/memprof/lit.cfg.py b/compiler-rt/test/memprof/lit.cfg.py index 4057da0c65b51..e28507be4dc9e 100644 --- a/compiler-rt/test/memprof/lit.cfg.py +++ b/compiler-rt/test/memprof/lit.cfg.py @@ -106,7 +106,7 @@ def build_invocation(compile_flags): config.substitutions.append(("%pie", "-pie")) # Only run the tests on supported OSs. 
-if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True if not config.parallelism_group: diff --git a/compiler-rt/test/metadata/lit.cfg.py b/compiler-rt/test/metadata/lit.cfg.py index 73ba27ad3a4e2..9980e93b3a6ec 100644 --- a/compiler-rt/test/metadata/lit.cfg.py +++ b/compiler-rt/test/metadata/lit.cfg.py @@ -5,5 +5,5 @@ config.suffixes = [".cpp"] # Binary metadata is currently emitted only for ELF binaries # and sizes of stack arguments depend on the arch. -if config.host_os not in ["Linux"] or config.target_arch not in ["x86_64"]: +if config.target_os not in ["Linux"] or config.target_arch not in ["x86_64"]: config.unsupported = True diff --git a/compiler-rt/test/msan/Linux/lit.local.cfg.py b/compiler-rt/test/msan/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/msan/Linux/lit.local.cfg.py +++ b/compiler-rt/test/msan/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/msan/lit.cfg.py b/compiler-rt/test/msan/lit.cfg.py index 361be79e2557e..d9e83c67b84c8 100644 --- a/compiler-rt/test/msan/lit.cfg.py +++ b/compiler-rt/test/msan/lit.cfg.py @@ -20,7 +20,7 @@ + config.debug_info_flags ) # Some Msan tests leverage backtrace() which requires libexecinfo on FreeBSD. -if config.host_os == "FreeBSD": +if config.target_os == "FreeBSD": clang_msan_cflags += ["-lexecinfo", "-fPIC"] # On SystemZ we need -mbackchain to make the fast unwinder work. if config.target_arch == "s390x": @@ -44,7 +44,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Linux", "NetBSD", "FreeBSD"]: +if config.target_os not in ["Linux", "NetBSD", "FreeBSD"]: config.unsupported = True # For mips64, mips64el we have forced store_context_size to 1 because these @@ -55,5 +55,5 @@ def build_invocation(compile_flags): else: config.substitutions.append(("CHECK-%short-stack", "CHECK-FULL-STACK")) -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/nsan/lit.cfg.py b/compiler-rt/test/nsan/lit.cfg.py index 2d67911a7d5d8..8225c85c41b81 100644 --- a/compiler-rt/test/nsan/lit.cfg.py +++ b/compiler-rt/test/nsan/lit.cfg.py @@ -32,5 +32,5 @@ def build_invocation(compile_flags): ) # NSan tests are currently supported on Linux only. 
-if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py index b455a936e7cc1..2e3d36c446714 100644 --- a/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "Darwin": +if config.root.target_os != "Darwin": config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py index e9b1b38ccacd1..0efdb55dc77f4 100644 --- a/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "FreeBSD": +if config.root.target_os != "FreeBSD": config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py index 7d85fa3fce392..32e5cfdb141ae 100644 --- a/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "Linux": +if config.root.target_os != "Linux": config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py index 6d4e7da813641..99d4464cf9e77 100644 --- a/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "Windows": +if config.root.target_os != "Windows": config.unsupported = True diff --git a/compiler-rt/test/orc/lit.cfg.py b/compiler-rt/test/orc/lit.cfg.py index 7a6eb4e7de325..3c3badb642ff7 100644 --- a/compiler-rt/test/orc/lit.cfg.py +++ b/compiler-rt/test/orc/lit.cfg.py @@ -18,11 +18,11 @@ config.available_features.add("host-arch-compatible") # If the target OS hasn't been set then assume host. -if not config.target_os: - config.target_os = config.host_os +if not config.orc_test_target_os: + config.orc_test_target_os = config.target_os config.test_target_is_host_executable = ( - config.target_os == config.host_os and host_arch_compatible + config.orc_test_target_os == config.target_os and host_arch_compatible ) # Assume that llvm-jitlink is in the config.llvm_tools_dir. @@ -31,7 +31,7 @@ config.compiler_rt_obj_root, "lib/orc/tests/tools/orc-rt-executor" ) lli = os.path.join(config.llvm_tools_dir, "lli") -if config.host_os == "Darwin": +if config.target_os == "Darwin": orc_rt_path = "%s/liborc_rt_osx.a" % config.compiler_rt_libdir else: orc_rt_path = "%s/liborc_rt%s.a" % (config.compiler_rt_libdir, config.target_suffix) @@ -53,7 +53,7 @@ def build_invocation(compile_flags): config.substitutions.append( ("%clang_cl ", build_invocation(["--driver-mode=cl"] + [config.target_cflags])) ) -if config.host_os == "Windows": +if config.target_os == "Windows": config.substitutions.append( ( "%llvm_jitlink", @@ -86,7 +86,7 @@ def build_invocation(compile_flags): # Exclude Inputs directories. config.excludes = ["Inputs"] -if config.host_os not in ["Darwin", "FreeBSD", "Linux", "Windows"]: +if config.target_os not in ["Darwin", "FreeBSD", "Linux", "Windows"]: config.unsupported = True # Ask llvm-config about assertion mode. 
diff --git a/compiler-rt/test/orc/lit.site.cfg.py.in b/compiler-rt/test/orc/lit.site.cfg.py.in index a33ef3d7d7207..d0625f6ace15c 100644 --- a/compiler-rt/test/orc/lit.site.cfg.py.in +++ b/compiler-rt/test/orc/lit.site.cfg.py.in @@ -5,7 +5,8 @@ config.name_suffix = "@ORC_TEST_CONFIG_SUFFIX@" config.orc_lit_source_dir = "@ORC_LIT_SOURCE_DIR@" config.target_cflags = "@ORC_TEST_TARGET_CFLAGS@" config.target_arch = "@ORC_TEST_TARGET_ARCH@" -config.target_os = "@ORC_TEST_TARGET_OS@" +# FIXME: Remove this variable, the target OS is available in config.target_os. +config.orc_test_target_os = "@ORC_TEST_TARGET_OS@" config.built_with_llvm = ("@COMPILER_RT_STANDALONE_BUILD@" != "TRUE") config.libunwind_shared = "@LIBUNWIND_ENABLE_SHARED@" config.libunwind_install_dir = "@LLVM_BINARY_DIR@/@LIBUNWIND_INSTALL_LIBRARY_DIR@" diff --git a/compiler-rt/test/profile/AIX/lit.local.cfg.py b/compiler-rt/test/profile/AIX/lit.local.cfg.py index 55462708e3b6c..3337c692bd0d7 100644 --- a/compiler-rt/test/profile/AIX/lit.local.cfg.py +++ b/compiler-rt/test/profile/AIX/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["AIX"]: +if root.target_os not in ["AIX"]: config.unsupported = True diff --git a/compiler-rt/test/profile/Darwin/lit.local.cfg.py b/compiler-rt/test/profile/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/profile/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/profile/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/profile/Linux/lit.local.cfg.py b/compiler-rt/test/profile/Linux/lit.local.cfg.py index c1e89581a1ab9..4bce33db9bbf7 100644 --- a/compiler-rt/test/profile/Linux/lit.local.cfg.py +++ b/compiler-rt/test/profile/Linux/lit.local.cfg.py @@ -42,7 +42,7 @@ def is_gold_linker_available(): root = getRoot(config) -if root.host_os not in ["Linux"] or not is_gold_linker_available(): +if root.target_os not in ["Linux"] or not is_gold_linker_available(): config.unsupported = True if config.have_curl: diff --git a/compiler-rt/test/profile/Posix/lit.local.cfg.py b/compiler-rt/test/profile/Posix/lit.local.cfg.py index 17a67689192d0..62ee3cbb466c4 100644 --- a/compiler-rt/test/profile/Posix/lit.local.cfg.py +++ b/compiler-rt/test/profile/Posix/lit.local.cfg.py @@ -6,12 +6,12 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True # AIX usually usually makes use of an explicit export list when linking a shared # object, since the linker doesn't export anything by default. 
-if root.host_os in ["AIX"]: +if root.target_os in ["AIX"]: config.substitutions.append(("%shared_linker_xopts", "-Wl,-bE:shr.exp")) else: config.substitutions.append(("%shared_linker_xopts", "")) diff --git a/compiler-rt/test/profile/Windows/lit.local.cfg.py b/compiler-rt/test/profile/Windows/lit.local.cfg.py index 57c0979e60962..b622e072bcbfb 100644 --- a/compiler-rt/test/profile/Windows/lit.local.cfg.py +++ b/compiler-rt/test/profile/Windows/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Windows"]: +if root.target_os not in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py index c9a716abeccd8..df7f11e2b286b 100644 --- a/compiler-rt/test/profile/lit.cfg.py +++ b/compiler-rt/test/profile/lit.cfg.py @@ -30,7 +30,7 @@ def get_required_attr(config, attr_name): target_is_msvc = bool(re.match(r".*-windows-msvc$", config.target_triple)) -if config.host_os in ["Linux"]: +if config.target_os in ["Linux"]: extra_link_flags = ["-ldl"] elif target_is_msvc: # InstrProf is incompatible with incremental linking. Disable it as a @@ -154,7 +154,7 @@ def exclude_unsupported_files_for_aix(dirname): ) ) -if config.host_os not in [ +if config.target_os not in [ "Windows", "Darwin", "FreeBSD", @@ -167,10 +167,10 @@ def exclude_unsupported_files_for_aix(dirname): config.unsupported = True config.substitutions.append( - ("%shared_lib_flag", "-dynamiclib" if (config.host_os == "Darwin") else "-shared") + ("%shared_lib_flag", "-dynamiclib" if (config.target_os == "Darwin") else "-shared") ) -if config.host_os in ["AIX"]: +if config.target_os in ["AIX"]: config.available_features.add("system-aix") exclude_unsupported_files_for_aix(config.test_source_root) exclude_unsupported_files_for_aix(config.test_source_root + "/Posix") @@ -184,5 +184,5 @@ def exclude_unsupported_files_for_aix(dirname): if config.have_curl: config.available_features.add("curl") -if config.host_os in ("AIX", "Darwin", "Linux"): +if config.target_os in ("AIX", "Darwin", "Linux"): config.available_features.add("continuous-mode") diff --git a/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in b/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in index 59e1e10360b52..41fcb32e5009b 100644 --- a/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in +++ b/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in @@ -15,7 +15,7 @@ config.test_source_root = config.test_exec_root if not config.parallelism_group: config.parallelism_group = 'shadow-memory' -if config.host_os == 'Darwin': +if config.target_os == 'Darwin': # On Darwin, we default to ignore_noninstrumented_modules=1, which also # suppresses some races the tests are supposed to find. See rtsan/lit.cfg.py. if 'RTSAN_OPTIONS' in config.environment: diff --git a/compiler-rt/test/rtsan/lit.cfg.py b/compiler-rt/test/rtsan/lit.cfg.py index 7c75515a7608d..6d880c10ecd45 100644 --- a/compiler-rt/test/rtsan/lit.cfg.py +++ b/compiler-rt/test/rtsan/lit.cfg.py @@ -6,7 +6,7 @@ default_rtsan_opts = "atexit_sleep_ms=0" -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. default_rtsan_opts += ":abort_on_error=0" @@ -36,7 +36,7 @@ def build_invocation(compile_flags): llvm_rtsan = os.path.join(config.llvm_tools_dir, "llvm-rtsan") # Setup substitutions. 
-if config.host_os == "Linux": +if config.target_os == "Linux": libdl_flag = "-ldl" else: libdl_flag = "" @@ -52,7 +52,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD", "OpenBSD"]: +if config.target_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD", "OpenBSD"]: config.unsupported = True elif "64" not in config.host_arch: if "arm" in config.host_arch: @@ -61,5 +61,5 @@ def build_invocation(compile_flags): else: config.unsupported = True -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_nomprotect_prefix)) diff --git a/compiler-rt/test/safestack/lit.cfg.py b/compiler-rt/test/safestack/lit.cfg.py index 4ab9c1ce70bac..3f5565caa65c6 100644 --- a/compiler-rt/test/safestack/lit.cfg.py +++ b/compiler-rt/test/safestack/lit.cfg.py @@ -33,5 +33,5 @@ ) ) -if config.host_os not in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: +if config.target_os not in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py index 0102001660cf1..d4948f04ef64e 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["FreeBSD"]: +if root.target_os not in ["FreeBSD"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py index 3cd1aa667343c..aa4438d04380a 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["NetBSD"]: +if root.target_os not in ["NetBSD"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os 
in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/lit.common.cfg.py b/compiler-rt/test/sanitizer_common/lit.common.cfg.py index 88d3ea9bc5ad2..5614229d9a126 100644 --- a/compiler-rt/test/sanitizer_common/lit.common.cfg.py +++ b/compiler-rt/test/sanitizer_common/lit.common.cfg.py @@ -40,7 +40,7 @@ config.available_features.add(config.tool_name) if ( - config.host_os == "Linux" + config.target_os == "Linux" and config.tool_name == "lsan" and config.target_arch == "i386" ): @@ -49,7 +49,7 @@ if config.arm_thumb: config.available_features.add("thumb") -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. default_tool_options += ["abort_on_error=0"] @@ -68,7 +68,7 @@ extra_link_flags = [] -if config.host_os in ["Linux"]: +if config.target_os in ["Linux"]: extra_link_flags += ["-ldl"] clang_cflags = config.debug_info_flags + tool_cflags + [config.target_cflags] @@ -92,13 +92,13 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Linux", "Darwin", "NetBSD", "FreeBSD", "SunOS"]: +if config.target_os not in ["Linux", "Darwin", "NetBSD", "FreeBSD", "SunOS"]: config.unsupported = True if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) if os.path.exists("/etc/services"): diff --git a/compiler-rt/test/scudo/lit.cfg.py b/compiler-rt/test/scudo/lit.cfg.py index 5d45bd99804c7..b09c996e9ccc5 100644 --- a/compiler-rt/test/scudo/lit.cfg.py +++ b/compiler-rt/test/scudo/lit.cfg.py @@ -70,5 +70,5 @@ def build_invocation(compile_flags): ) # Hardened Allocator tests are currently supported on Linux only. 
-if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/shadowcallstack/lit.cfg.py b/compiler-rt/test/shadowcallstack/lit.cfg.py index 70a6b16174c4b..5b95deb1b0986 100644 --- a/compiler-rt/test/shadowcallstack/lit.cfg.py +++ b/compiler-rt/test/shadowcallstack/lit.cfg.py @@ -32,5 +32,5 @@ ) ) -if config.host_os not in ["Linux"] or config.target_arch not in ["aarch64", "riscv64"]: +if config.target_os not in ["Linux"] or config.target_arch not in ["aarch64", "riscv64"]: config.unsupported = True diff --git a/compiler-rt/test/tsan/Darwin/lit.local.cfg.py b/compiler-rt/test/tsan/Darwin/lit.local.cfg.py index 7bf80ac5e1375..876f0cd638bd2 100644 --- a/compiler-rt/test/tsan/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/tsan/Darwin/lit.local.cfg.py @@ -6,7 +6,7 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True config.environment["TSAN_OPTIONS"] += ":ignore_noninstrumented_modules=1" diff --git a/compiler-rt/test/tsan/Linux/lit.local.cfg.py b/compiler-rt/test/tsan/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/tsan/Linux/lit.local.cfg.py +++ b/compiler-rt/test/tsan/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in b/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in index a9c6261ba48d4..b90af4f2c0d3a 100644 --- a/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in +++ b/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in @@ -15,7 +15,7 @@ config.test_source_root = config.test_exec_root if not config.parallelism_group: config.parallelism_group = 'shadow-memory' -if config.host_os == 'Darwin': +if config.target_os == 'Darwin': # On Darwin, we default to ignore_noninstrumented_modules=1, which also # suppresses some races the tests are supposed to find. See tsan/lit.cfg.py. if 'TSAN_OPTIONS' in config.environment: diff --git a/compiler-rt/test/tsan/libcxx/lit.local.cfg.py b/compiler-rt/test/tsan/libcxx/lit.local.cfg.py index f4820dccb0109..b8d054e2de976 100644 --- a/compiler-rt/test/tsan/libcxx/lit.local.cfg.py +++ b/compiler-rt/test/tsan/libcxx/lit.local.cfg.py @@ -8,5 +8,5 @@ def getRoot(config): # Only run if we have an instrumented libcxx. On Darwin, run always (we have # interceptors to support the system-provided libcxx). -if not root.has_libcxx and root.host_os != "Darwin": +if not root.has_libcxx and root.target_os != "Darwin": config.unsupported = True diff --git a/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py b/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py index a7653f4305952..27edf611a0522 100644 --- a/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py +++ b/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py @@ -14,5 +14,5 @@ def getRoot(config): else: config.unsupported = True -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.environment["TSAN_OPTIONS"] += ":ignore_noninstrumented_modules=1" diff --git a/compiler-rt/test/tsan/lit.cfg.py b/compiler-rt/test/tsan/lit.cfg.py index a93333e2e593d..8803a7bda9aa5 100644 --- a/compiler-rt/test/tsan/lit.cfg.py +++ b/compiler-rt/test/tsan/lit.cfg.py @@ -23,7 +23,7 @@ def get_required_attr(config, attr_name): # Setup environment variables for running ThreadSanitizer. 
default_tsan_opts = "atexit_sleep_ms=0" -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. default_tsan_opts += ":abort_on_error=0" @@ -61,7 +61,7 @@ def get_required_attr(config, attr_name): ) # Add additional flags if we're using instrumented libc++. # Instrumented libcxx currently not supported on Darwin. -if config.has_libcxx and config.host_os != "Darwin": +if config.has_libcxx and config.target_os != "Darwin": # FIXME: Dehardcode this path somehow. libcxx_path = os.path.join( config.compiler_rt_obj_root, @@ -86,7 +86,7 @@ def build_invocation(compile_flags): config.substitutions.append(("%clangxx_tsan ", build_invocation(clang_tsan_cxxflags))) # Define CHECK-%os to check for OS-dependent output. -config.substitutions.append(("CHECK-%os", ("CHECK-" + config.host_os))) +config.substitutions.append(("CHECK-%os", ("CHECK-" + config.target_os))) config.substitutions.append( ( @@ -101,7 +101,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp", ".m", ".mm"] -if config.host_os not in ["FreeBSD", "Linux", "Darwin", "NetBSD"]: +if config.target_os not in ["FreeBSD", "Linux", "Darwin", "NetBSD"]: config.unsupported = True if config.android: @@ -110,5 +110,5 @@ def build_invocation(compile_flags): if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/tysan/lit.cfg.py b/compiler-rt/test/tysan/lit.cfg.py index f38e0211639da..26846017b1957 100644 --- a/compiler-rt/test/tysan/lit.cfg.py +++ b/compiler-rt/test/tysan/lit.cfg.py @@ -71,7 +71,7 @@ def push_dynamic_library_lookup_path(config, new_path): # Setup source root. config.test_source_root = os.path.dirname(__file__) -if config.host_os not in ["FreeBSD", "NetBSD"]: +if config.target_os not in ["FreeBSD", "NetBSD"]: libdl_flag = "-ldl" else: libdl_flag = "" @@ -127,10 +127,10 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.suffixes.append(".mm") -if config.host_os == "Windows": +if config.target_os == "Windows": config.substitutions.append(("%fPIC", "")) config.substitutions.append(("%fPIE", "")) config.substitutions.append(("%pie", "")) @@ -140,7 +140,7 @@ def build_invocation(compile_flags): config.substitutions.append(("%pie", "-pie")) # Only run the tests on supported OSs. 
-if config.host_os not in [ +if config.target_os not in [ "Linux", "Darwin", ]: diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py index e69d15f5b141c..4342649532865 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py @@ -1,4 +1,4 @@ -if config.host_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD"]: +if config.target_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD"]: config.unsupported = True # Work around "Cannot represent a difference across sections" if config.target_arch == "powerpc64": diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/ubsan/lit.common.cfg.py b/compiler-rt/test/ubsan/lit.common.cfg.py index 04d6f24de5a9f..25e527903788e 100644 --- a/compiler-rt/test/ubsan/lit.common.cfg.py +++ b/compiler-rt/test/ubsan/lit.common.cfg.py @@ -74,7 +74,7 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp", ".m"] # Check that the host supports UndefinedBehaviorSanitizer tests -if config.host_os not in [ +if config.target_os not in [ "Linux", "Darwin", "FreeBSD", @@ -90,5 +90,5 @@ def build_invocation(compile_flags): if ubsan_lit_test_mode in ["AddressSanitizer", "MemorySanitizer", "ThreadSanitizer"]: if not config.parallelism_group: config.parallelism_group = "shadow-memory" - if config.host_os == "NetBSD": + if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/ubsan_minimal/lit.common.cfg.py b/compiler-rt/test/ubsan_minimal/lit.common.cfg.py index 714241a580f9d..bcc0e46fbef91 100644 --- a/compiler-rt/test/ubsan_minimal/lit.common.cfg.py +++ b/compiler-rt/test/ubsan_minimal/lit.common.cfg.py @@ -35,7 +35,7 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp"] # Check that the host supports UndefinedBehaviorSanitizerMinimal tests -if config.host_os not in [ +if config.target_os not in [ "Linux", "FreeBSD", "NetBSD", diff --git a/compiler-rt/test/xray/lit.cfg.py b/compiler-rt/test/xray/lit.cfg.py index f73ae3acd7715..e56ed85d1d822 100644 --- a/compiler-rt/test/xray/lit.cfg.py +++ b/compiler-rt/test/xray/lit.cfg.py @@ -14,7 +14,7 @@ # If libc++ was used to build XRAY libraries, libc++ is needed. Fix applied # to Linux only since -rpath may not be portable. This can be extended to # other platforms. 
-if config.libcxx_used == "1" and config.host_os == "Linux": +if config.libcxx_used == "1" and config.target_os == "Linux": clang_xray_cflags = clang_xray_cflags + ( ["-L%s -lc++ -Wl,-rpath=%s" % (config.llvm_shlib_dir, config.llvm_shlib_dir)] ) @@ -30,7 +30,7 @@ def build_invocation(compile_flags): llvm_xray = os.path.join(config.llvm_tools_dir, "llvm-xray") # Setup substitutions. -if config.host_os == "Linux": +if config.target_os == "Linux": libdl_flag = "-ldl" else: libdl_flag = "" @@ -56,7 +56,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os not in ["FreeBSD", "Linux", "NetBSD", "OpenBSD"]: +if config.target_os not in ["FreeBSD", "Linux", "NetBSD", "OpenBSD"]: config.unsupported = True elif "64" not in config.host_arch: if "arm" in config.host_arch: @@ -65,5 +65,5 @@ def build_invocation(compile_flags): else: config.unsupported = True -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_nomprotect_prefix)) diff --git a/compiler-rt/unittests/lit.common.unit.cfg.py b/compiler-rt/unittests/lit.common.unit.cfg.py index 557a42893ec15..93f417c1d50ae 100644 --- a/compiler-rt/unittests/lit.common.unit.cfg.py +++ b/compiler-rt/unittests/lit.common.unit.cfg.py @@ -42,7 +42,7 @@ def get_lit_conf(name, default=None): if "TEMP" in os.environ: config.environment["TEMP"] = os.environ["TEMP"] -if config.host_os == "Darwin": +if config.target_os == "Darwin": # Only run up to 3 processes that require shadow memory simultaneously on # 64-bit Darwin. Using more scales badly and hogs the system due to # inefficient handling of large mmap'd regions (terabytes) by the kernel. diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in index 3e42e83c9e70a..30ccf452ac71f 100644 --- a/compiler-rt/unittests/lit.common.unit.configured.in +++ b/compiler-rt/unittests/lit.common.unit.configured.in @@ -10,7 +10,7 @@ config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@ config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@") config.host_arch = "@HOST_ARCH@" -config.host_os = "@HOST_OS@" +config.target_os = "@HOST_OS@" config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@" config.gwp_asan = @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@ config.emulator = "@COMPILER_RT_EMULATOR@" diff --git a/cross-project-tests/dtlto/link-archive-thin.test b/cross-project-tests/dtlto/link-archive-thin.test new file mode 100644 index 0000000000000..fbd8fd67300cf --- /dev/null +++ b/cross-project-tests/dtlto/link-archive-thin.test @@ -0,0 +1,93 @@ +REQUIRES: lld-link + +## Test that a DTLTO link succeeds and outputs the expected set of files +## correctly when thin archives are present. + +RUN: rm -rf %t && split-file %s %t && cd %t + +## Compile bitcode. -O2 is required for cross-module importing. +RUN: %clang -O2 --target=x86_64-pc-windows-msvc -flto=thin -c \ +RUN: foo.c bar.c dog.c cat.c start.c + +## Generate thin archives. +RUN: lld-link /lib /llvmlibthin /out:foo.lib foo.o +## Create this bitcode thin archive in a subdirectory to test the expansion of +## the path to a bitcode file that is referenced using "..", e.g., in this case +## "../bar.o". +RUN: mkdir lib +RUN: lld-link /lib /llvmlibthin /out:lib/bar.lib bar.o +## Create this bitcode thin archive with an absolute path entry containing "..". 
+RUN: lld-link /lib /llvmlibthin /out:dog.lib %t/lib/../dog.o +RUN: lld-link /lib /llvmlibthin /out:cat.lib cat.o +RUN: lld-link /lib /llvmlibthin /out:start.lib start.o + +## Link from a different directory to ensure that thin archive member paths are +## resolved correctly relative to the archive locations. +RUN: mkdir %t/out && cd %t/out +RUN: lld-link /subsystem:console /machine:x64 /entry:start /out:my.exe \ +RUN: %t/foo.lib %t/lib/bar.lib ../start.lib %t/cat.lib \ +RUN: /includeoptional:dog ../dog.lib \ +RUN: -thinlto-distributor:%python \ +RUN: -thinlto-distributor-arg:%llvm_src_root/utils/dtlto/local.py \ +RUN: -thinlto-remote-compiler:%clang \ +RUN: /lldsavetemps + +## Check that the required output files have been created. +RUN: ls | FileCheck %s --check-prefix=OUTPUTS --implicit-check-not=cat + +## JSON jobs description. +OUTPUTS-DAG: my.[[PID:[a-zA-Z0-9_]+]].dist-file.json + +## Individual summary index files. +OUTPUTS-DAG: start.1.[[PID]].native.o.thinlto.bc{{$}} +OUTPUTS-DAG: dog.2.[[PID]].native.o.thinlto.bc{{$}} +OUTPUTS-DAG: foo.3.[[PID]].native.o.thinlto.bc{{$}} +OUTPUTS-DAG: bar.4.[[PID]].native.o.thinlto.bc{{$}} + +## Native output object files. +OUTPUTS-DAG: start.1.[[PID]].native.o{{$}} +OUTPUTS-DAG: dog.2.[[PID]].native.o{{$}} +OUTPUTS-DAG: foo.3.[[PID]].native.o{{$}} +OUTPUTS-DAG: bar.4.[[PID]].native.o{{$}} + + +## It is important that cross-module inlining occurs for this test to show that Clang can +## successfully load the bitcode file dependencies recorded in the summary indices. +## Explicitly check that the expected importing has occurred. + +RUN: llvm-dis start.1.*.native.o.thinlto.bc -o - | \ +RUN: FileCheck %s --check-prefixes=FOO,BAR,START + +RUN: llvm-dis dog.2.*.native.o.thinlto.bc -o - | \ +RUN: FileCheck %s --check-prefixes=FOO,BAR,DOG,START + +RUN: llvm-dis foo.3.*.native.o.thinlto.bc -o - | \ +RUN: FileCheck %s --check-prefixes=FOO,BAR,START + +RUN: llvm-dis bar.4.*.native.o.thinlto.bc -o - | \ +RUN: FileCheck %s --check-prefixes=FOO,BAR,START + +FOO-DAG: foo.o +BAR-DAG: bar.o +DOG-DAG: dog.o +START-DAG: start.o + + +#--- foo.c +extern int bar(int), start(int); +__attribute__((retain)) int foo(int x) { return x + bar(x) + start(x); } + +#--- bar.c +extern int foo(int), start(int); +__attribute__((retain)) int bar(int x) { return x + foo(x) + start(x); } + +#--- dog.c +extern int foo(int), bar(int), start(int); +__attribute__((retain)) int dog(int x) { return x + foo(x) + bar(x) + start(x); } + +#--- cat.c +__attribute__((retain)) void cat(int x) {} + +#--- start.c +extern int foo(int), bar(int); +__attribute__((retain)) int start(int x) { return x + foo(x) + bar(x); } diff --git a/cross-project-tests/dtlto/link-dtlto.c b/cross-project-tests/dtlto/link-dtlto.c new file mode 100644 index 0000000000000..0ab4ec57f115d --- /dev/null +++ b/cross-project-tests/dtlto/link-dtlto.c @@ -0,0 +1,41 @@ +// REQUIRES: lld-link + +/// Simple test that DTLTO works with a single input bitcode file and that +/// --save-temps can be applied to the remote compilation. + +// RUN: rm -rf %t && mkdir %t && cd %t + +// RUN: %clang --target=x86_64-pc-windows-msvc -c -flto=thin %s -o dtlto.obj + +// RUN: lld-link /subsystem:console /entry:_start dtlto.obj \ +// RUN: -thinlto-distributor:%python \ +// RUN: -thinlto-distributor-arg:%llvm_src_root/utils/dtlto/local.py \ +// RUN: -thinlto-remote-compiler:%clang \ +// RUN: -thinlto-remote-compiler-arg:--save-temps + +/// Check that the required output files have been created. 
+// RUN: ls | sort | FileCheck %s + +/// No files are expected before. +// CHECK-NOT: {{.}} + +/// Linked ELF. +// CHECK: {{^}}dtlto.exe{{$}} + +/// Produced by the bitcode compilation. +// CHECK-NEXT: {{^}}dtlto.obj{{$}} + +/// --save-temps output for the backend compilation. +// CHECK-NEXT: {{^}}dtlto.s{{$}} +// CHECK-NEXT: {{^}}dtlto.s.0.preopt.bc{{$}} +// CHECK-NEXT: {{^}}dtlto.s.1.promote.bc{{$}} +// CHECK-NEXT: {{^}}dtlto.s.2.internalize.bc{{$}} +// CHECK-NEXT: {{^}}dtlto.s.3.import.bc{{$}} +// CHECK-NEXT: {{^}}dtlto.s.4.opt.bc{{$}} +// CHECK-NEXT: {{^}}dtlto.s.5.precodegen.bc{{$}} +// CHECK-NEXT: {{^}}dtlto.s.resolution.txt{{$}} + +/// No files are expected after. +// CHECK-NOT: {{.}} + +int _start() { return 0; } diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index b35c643ac898c..6a902bc877c89 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -19,7 +19,7 @@ config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) # suffixes: A list of file extensions to treat as test files. -config.suffixes = [".c", ".cl", ".cpp", ".m"] +config.suffixes = [".c", ".cl", ".cpp", ".m", ".test"] # excludes: A list of directories to exclude from the testsuite. The 'Inputs' # subdirectories contain auxiliary inputs for various tests in their parent @@ -237,15 +237,6 @@ def can_target_host(): dependencies = configure_dexter_substitutions() if all(d in config.available_features for d in dependencies): config.available_features.add("dexter") - llvm_config.with_environment( - "PATHTOCLANG", add_host_triple(llvm_config.config.clang) - ) - llvm_config.with_environment( - "PATHTOCLANGPP", add_host_triple(llvm_config.use_llvm_tool("clang++")) - ) - llvm_config.with_environment( - "PATHTOCLANGCL", add_host_triple(llvm_config.use_llvm_tool("clang-cl")) - ) else: print( "Host triple {} not supported. 
Skipping dexter tests in the " diff --git a/flang-rt/include/flang-rt/runtime/descriptor.h b/flang-rt/include/flang-rt/runtime/descriptor.h index 68106f3462c9b..bc5a5b5f14697 100644 --- a/flang-rt/include/flang-rt/runtime/descriptor.h +++ b/flang-rt/include/flang-rt/runtime/descriptor.h @@ -478,7 +478,8 @@ class Descriptor { const SubscriptValue *upper = nullptr, const SubscriptValue *stride = nullptr); - RT_API_ATTRS void ApplyMold(const Descriptor &, int rank); + RT_API_ATTRS void ApplyMold( + const Descriptor &, int rank, bool isMonomorphic = false); RT_API_ATTRS void Check() const; diff --git a/flang-rt/include/flang-rt/runtime/non-tbp-dio.h b/flang-rt/include/flang-rt/runtime/non-tbp-dio.h index 99d4113b6c7a8..26849298ec959 100644 --- a/flang-rt/include/flang-rt/runtime/non-tbp-dio.h +++ b/flang-rt/include/flang-rt/runtime/non-tbp-dio.h @@ -34,11 +34,16 @@ namespace Fortran::runtime::io { RT_OFFLOAD_API_GROUP_BEGIN +enum NonTbpDefinedIoFlags { + IsDtvArgPolymorphic = 1 << 0, // first dummy arg is CLASS(T) + DefinedIoInteger8 = 1 << 1, // -fdefault-integer-8 affected UNIT= & IOSTAT= +}; + struct NonTbpDefinedIo { const typeInfo::DerivedType &derivedType; void (*subroutine)(); // null means no non-TBP defined I/O here common::DefinedIo definedIo; - bool isDtvArgPolymorphic; // first dummy arg is CLASS(T) + std::uint8_t flags; }; struct NonTbpDefinedIoTable { diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h index a8d39f4f8a1a3..93bca24a602b4 100644 --- a/flang-rt/include/flang-rt/runtime/type-info.h +++ b/flang-rt/include/flang-rt/runtime/type-info.h @@ -143,9 +143,9 @@ class SpecialBinding { // I/O procedures that are not type-bound. RT_API_ATTRS SpecialBinding(Which which, ProcedurePointer proc, std::uint8_t isArgDescSet, std::uint8_t isTypeBound, - std::uint8_t isArgContiguousSet) + std::uint8_t specialCaseFlag) : which_{which}, isArgDescriptorSet_{isArgDescSet}, - isTypeBound_{isTypeBound}, isArgContiguousSet_{isArgContiguousSet}, + isTypeBound_{isTypeBound}, specialCaseFlag_{specialCaseFlag}, proc_{proc} {} static constexpr RT_API_ATTRS Which RankFinal(int rank) { @@ -153,13 +153,11 @@ class SpecialBinding { } RT_API_ATTRS Which which() const { return which_; } + RT_API_ATTRS bool specialCaseFlag() const { return specialCaseFlag_; } RT_API_ATTRS bool IsArgDescriptor(int zeroBasedArg) const { return (isArgDescriptorSet_ >> zeroBasedArg) & 1; } RT_API_ATTRS bool IsTypeBound() const { return isTypeBound_ != 0; } - RT_API_ATTRS bool IsArgContiguous(int zeroBasedArg) const { - return (isArgContiguousSet_ >> zeroBasedArg) & 1; - } template RT_API_ATTRS PROC GetProc(const Binding *bindings = nullptr) const { if (bindings && isTypeBound_ > 0) { @@ -203,10 +201,10 @@ class SpecialBinding { // When a special binding is type-bound, this is its binding's index (plus 1, // so that 0 signifies that it's not type-bound). std::uint8_t isTypeBound_{0}; - // True when a FINAL subroutine has a dummy argument that is an array that - // is CONTIGUOUS or neither assumed-rank nor assumed-shape. - std::uint8_t isArgContiguousSet_{0}; - + // For a FINAL subroutine, set when it has a dummy argument that is an array + // that is CONTIGUOUS or neither assumed-rank nor assumed-shape. + // For a defined I/O subroutine, set when UNIT= and IOSTAT= are INTEGER(8). 
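For illustration only (hypothetical example, not part of this change): the DefinedIoInteger8 flag and the specialCaseFlag it sets correspond to a non-type-bound defined I/O subroutine compiled with -fdefault-integer-8, where the UNIT=, IOSTAT=, and v_list dummy arguments are INTEGER(8) rather than default INTEGER(4), so the runtime must pass 64-bit values:

  module m_example
    type t
      integer :: n = 0
    end type
    interface write(formatted)
      module procedure write_t
    end interface
  contains
    ! Under -fdefault-integer-8 every default INTEGER below is INTEGER(8),
    ! so the runtime has to supply 64-bit unit/iostat values and a 64-bit
    ! v_list descriptor when calling this subroutine.
    subroutine write_t(dtv, unit, iotype, v_list, iostat, iomsg)
      class(t), intent(in) :: dtv
      integer, intent(in) :: unit
      character(*), intent(in) :: iotype
      integer, intent(in) :: v_list(:)
      integer, intent(out) :: iostat
      character(*), intent(inout) :: iomsg
      write(unit, '(i0)', iostat=iostat, iomsg=iomsg) dtv%n
    end subroutine
  end module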
+ std::uint8_t specialCaseFlag_{0}; ProcedurePointer proc_{nullptr}; }; diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp index d642ed578b061..7cf4147a94a95 100644 --- a/flang-rt/lib/runtime/assign.cpp +++ b/flang-rt/lib/runtime/assign.cpp @@ -279,13 +279,15 @@ RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) { if (mustDeallocateLHS) { // Convert the LHS into a temporary, then make it look deallocated. toDeallocate_ = &tempDescriptor_.descriptor(); - persist_ = true; // tempDescriptor_ state must outlive child tickets std::memcpy( reinterpret_cast(toDeallocate_), &to_, to_.SizeInBytes()); to_.set_base_addr(nullptr); if (toDerived_ && (flags_ & NeedFinalization)) { - if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)}; - status != StatOk && status != StatContinue) { + int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)}; + if (status == StatContinue) { + // tempDescriptor_ state must outlive pending child ticket + persist_ = true; + } else if (status != StatOk) { return status; } flags_ &= ~NeedFinalization; @@ -304,6 +306,9 @@ RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) { if (int stat{ReturnError( workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))}; stat != StatOk) { + if (stat == StatContinue) { + persist_ = true; + } return stat; } if (HasDynamicComponent(*from_)) { @@ -507,6 +512,7 @@ RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) { } } if (persist_) { + // tempDescriptor_ must outlive pending child ticket(s) done_ = true; return StatContinue; } else { diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp index bb9a68abef2a7..4ed0baaa3d108 100644 --- a/flang-rt/lib/runtime/derived.cpp +++ b/flang-rt/lib/runtime/derived.cpp @@ -270,7 +270,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, StaticDescriptor statDesc; Descriptor ©{statDesc.descriptor()}; const Descriptor *argDescriptor{&descriptor}; - if (descriptor.rank() > 0 && special->IsArgContiguous(0) && + if (descriptor.rank() > 0 && special->specialCaseFlag() && !descriptor.IsContiguous()) { // The FINAL subroutine demands a contiguous array argument, but // this INTENT(OUT) or intrinsic assignment LHS isn't contiguous. diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp index b208cb2c397b3..3868c8ddce19f 100644 --- a/flang-rt/lib/runtime/descriptor-io.cpp +++ b/flang-rt/lib/runtime/descriptor-io.cpp @@ -67,13 +67,29 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( ioType, io.mutableModes().inNamelist ? 
"NAMELIST" : "LISTDIRECTED"); ioTypeLen = runtime::strlen(ioType); } + // V_LIST= argument StaticDescriptor<1, true> vListStatDesc; Descriptor &vListDesc{vListStatDesc.descriptor()}; - vListDesc.Establish(TypeCategory::Integer, sizeof(int), nullptr, 1); - vListDesc.set_base_addr(edit.vList); - vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); - vListDesc.GetDimension(0).SetByteStride( - static_cast(sizeof(int))); + bool integer8{special.specialCaseFlag()}; + std::int64_t vList64[edit.maxVListEntries]; + if (integer8) { + // Convert v_list values to INTEGER(8) + for (int j{0}; j < edit.vListEntries; ++j) { + vList64[j] = edit.vList[j]; + } + vListDesc.Establish( + TypeCategory::Integer, sizeof(std::int64_t), nullptr, 1); + vListDesc.set_base_addr(vList64); + vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); + vListDesc.GetDimension(0).SetByteStride( + static_cast(sizeof(std::int64_t))); + } else { + vListDesc.Establish(TypeCategory::Integer, sizeof(int), nullptr, 1); + vListDesc.set_base_addr(edit.vList); + vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); + vListDesc.GetDimension(0).SetByteStride( + static_cast(sizeof(int))); + } ExternalFileUnit *actualExternal{io.GetExternalFileUnit()}; ExternalFileUnit *external{actualExternal}; if (!external) { @@ -84,8 +100,8 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( ChildIo &child{external->PushChildIo(io)}; // Child formatted I/O is nonadvancing by definition (F'2018 12.6.2.4). auto restorer{common::ScopedSet(io.mutableModes().nonAdvancing, true)}; - int unit{external->unitNumber()}; - int ioStat{IostatOk}; + std::int32_t unit{external->unitNumber()}; + std::int32_t ioStat{IostatOk}; char ioMsg[100]; Fortran::common::optional startPos; if (edit.descriptor == DataEdit::DefinedDerivedType && @@ -98,23 +114,45 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( derived.binding().OffsetElement()}; if (special.IsArgDescriptor(0)) { // "dtv" argument is "class(t)", pass a descriptor - auto *p{special.GetProc( - bindings)}; StaticDescriptor<1, true, 10 /*?*/> elementStatDesc; Descriptor &elementDesc{elementStatDesc.descriptor()}; elementDesc.Establish( derived, nullptr, 0, nullptr, CFI_attribute_pointer); elementDesc.set_base_addr(descriptor.Element(subscripts)); - p(elementDesc, unit, ioType, vListDesc, ioStat, ioMsg, ioTypeLen, - sizeof ioMsg); + if (integer8) { // 64-bit UNIT=/IOSTAT= + std::int64_t unit64{unit}; + std::int64_t ioStat64{ioStat}; + auto *p{special.GetProc(bindings)}; + p(elementDesc, unit64, ioType, vListDesc, ioStat64, ioMsg, ioTypeLen, + sizeof ioMsg); + ioStat = ioStat64; + } else { // 32-bit UNIT=/IOSTAT= + auto *p{special.GetProc(bindings)}; + p(elementDesc, unit, ioType, vListDesc, ioStat, ioMsg, ioTypeLen, + sizeof ioMsg); + } } else { // "dtv" argument is "type(t)", pass a raw pointer - auto *p{special.GetProc( - bindings)}; - p(descriptor.Element(subscripts), unit, ioType, vListDesc, ioStat, - ioMsg, ioTypeLen, sizeof ioMsg); + if (integer8) { // 64-bit UNIT= and IOSTAT= + std::int64_t unit64{unit}; + std::int64_t ioStat64{ioStat}; + auto *p{special.GetProc(bindings)}; + p(descriptor.Element(subscripts), unit64, ioType, vListDesc, + ioStat64, ioMsg, ioTypeLen, sizeof ioMsg); + ioStat = ioStat64; + } else { // 32-bit UNIT= and IOSTAT= + auto *p{special.GetProc(bindings)}; + p(descriptor.Element(subscripts), unit, ioType, vListDesc, ioStat, + ioMsg, ioTypeLen, sizeof ioMsg); + } } handler.Forward(ioStat, ioMsg, sizeof ioMsg); external->PopChildIo(child); @@ 
-458,11 +496,16 @@ RT_API_ATTRS int DescriptorIoTicket::Begin(WorkQueue &workQueue) { ? common::DefinedIo::ReadUnformatted : common::DefinedIo::WriteUnformatted)}) { if (definedIo->subroutine) { + std::uint8_t isArgDescriptorSet{0}; + if (definedIo->flags & IsDtvArgPolymorphic) { + isArgDescriptorSet = 1; + } typeInfo::SpecialBinding special{DIR == Direction::Input ? typeInfo::SpecialBinding::Which::ReadUnformatted : typeInfo::SpecialBinding::Which::WriteUnformatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false}; + definedIo->subroutine, isArgDescriptorSet, + /*IsTypeBound=*/false, + /*specialCaseFlag=*/!!(definedIo->flags & DefinedIoInteger8)}; if (DefinedUnformattedIo(io_, instance_, *type, special)) { anyIoTookPlace_ = true; return StatOk; @@ -719,8 +762,11 @@ RT_API_ATTRS int DescriptorIoTicket::Begin(WorkQueue &workQueue) { nonTbpSpecial_.emplace(DIR == Direction::Input ? typeInfo::SpecialBinding::Which::ReadFormatted : typeInfo::SpecialBinding::Which::WriteFormatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false); + definedIo->subroutine, + /*isArgDescriptorSet=*/ + (definedIo->flags & IsDtvArgPolymorphic) ? 1 : 0, + /*isTypeBound=*/false, + /*specialCaseFlag=*/!!(definedIo->flags & DefinedIoInteger8)); special_ = &*nonTbpSpecial_; } } diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp index b723acdd27bd5..021440cbdd0f6 100644 --- a/flang-rt/lib/runtime/descriptor.cpp +++ b/flang-rt/lib/runtime/descriptor.cpp @@ -85,7 +85,7 @@ RT_API_ATTRS void Descriptor::Establish(int characterKind, RT_API_ATTRS void Descriptor::Establish(const typeInfo::DerivedType &dt, void *p, int rank, const SubscriptValue *extent, ISO::CFI_attribute_t attribute) { - std::size_t elementBytes{dt.sizeInBytes()}; + auto elementBytes{static_cast(dt.sizeInBytes())}; ISO::EstablishDescriptor( &raw_, p, attribute, CFI_type_struct, elementBytes, rank, extent); if (elementBytes == 0) { @@ -252,18 +252,21 @@ RT_API_ATTRS bool Descriptor::EstablishPointerSection(const Descriptor &source, return CFI_section(&raw_, &source.raw_, lower, upper, stride) == CFI_SUCCESS; } -RT_API_ATTRS void Descriptor::ApplyMold(const Descriptor &mold, int rank) { - raw_.elem_len = mold.raw_.elem_len; +RT_API_ATTRS void Descriptor::ApplyMold( + const Descriptor &mold, int rank, bool isMonomorphic) { raw_.rank = rank; - raw_.type = mold.raw_.type; for (int j{0}; j < rank && j < mold.raw_.rank; ++j) { GetDimension(j) = mold.GetDimension(j); } - if (auto *addendum{Addendum()}) { - if (auto *moldAddendum{mold.Addendum()}) { - *addendum = *moldAddendum; - } else { - INTERNAL_CHECK(!addendum->derivedType()); + if (!isMonomorphic) { + raw_.elem_len = mold.raw_.elem_len; + raw_.type = mold.raw_.type; + if (auto *addendum{Addendum()}) { + if (auto *moldAddendum{mold.Addendum()}) { + *addendum = *moldAddendum; + } else { + INTERNAL_CHECK(!addendum->derivedType()); + } } } } diff --git a/flang-rt/lib/runtime/edit-input.cpp b/flang-rt/lib/runtime/edit-input.cpp index 0cc287aa3b47e..13557678f6057 100644 --- a/flang-rt/lib/runtime/edit-input.cpp +++ b/flang-rt/lib/runtime/edit-input.cpp @@ -19,16 +19,19 @@ namespace Fortran::runtime::io { RT_OFFLOAD_API_GROUP_BEGIN -// Checks that a list-directed input value has been entirely consumed and -// doesn't contain unparsed characters before the next value separator. 
+// Handle DC or DECIMAL='COMMA' and determine the active separator character +static inline RT_API_ATTRS char32_t GetSeparatorChar(const DataEdit &edit) { + return edit.modes.editingFlags & decimalComma ? char32_t{';'} : char32_t{','}; +} + static inline RT_API_ATTRS bool IsCharValueSeparator( const DataEdit &edit, char32_t ch) { - char32_t comma{ - edit.modes.editingFlags & decimalComma ? char32_t{';'} : char32_t{','}}; - return ch == ' ' || ch == '\t' || ch == comma || ch == '/' || + return ch == ' ' || ch == '\t' || ch == '/' || ch == GetSeparatorChar(edit) || (edit.IsNamelist() && (ch == '&' || ch == '$')); } +// Checks that a list-directed input value has been entirely consumed and +// doesn't contain unparsed characters before the next value separator. static RT_API_ATTRS bool CheckCompleteListDirectedField( IoStatementState &io, const DataEdit &edit) { if (edit.IsListDirected()) { @@ -54,10 +57,6 @@ static RT_API_ATTRS bool CheckCompleteListDirectedField( } } -static inline RT_API_ATTRS char32_t GetSeparatorChar(const DataEdit &edit) { - return edit.modes.editingFlags & decimalComma ? char32_t{';'} : char32_t{','}; -} - template static RT_API_ATTRS bool EditBOZInput( IoStatementState &io, const DataEdit &edit, void *n, std::size_t bytes) { @@ -518,7 +517,7 @@ static RT_API_ATTRS ScannedRealInput ScanRealInput( // Consume the trailing ')' of a list-directed or NAMELIST complex // input value. if (edit.descriptor == DataEdit::ListDirectedImaginaryPart) { - if (next && (*next == ' ' || *next == '\t')) { + if (!next || *next == ' ' || *next == '\t') { io.SkipSpaces(remaining); next = io.NextInField(remaining, edit); } @@ -1006,27 +1005,7 @@ static RT_API_ATTRS bool EditListDirectedCharacterInput( // Undelimited list-directed character input: stop at a value separator // or the end of the current record. 
while (auto ch{io.GetCurrentChar(byteCount)}) { - bool isSep{false}; - switch (*ch) { - case ' ': - case '\t': - case '/': - isSep = true; - break; - case '&': - case '$': - isSep = edit.IsNamelist(); - break; - case ',': - isSep = !(edit.modes.editingFlags & decimalComma); - break; - case ';': - isSep = !!(edit.modes.editingFlags & decimalComma); - break; - default: - break; - } - if (isSep) { + if (IsCharValueSeparator(edit, *ch)) { break; } if (length > 0) { diff --git a/flang-rt/lib/runtime/extensions.cpp b/flang-rt/lib/runtime/extensions.cpp index e70dff3997233..f6c39468d5655 100644 --- a/flang-rt/lib/runtime/extensions.cpp +++ b/flang-rt/lib/runtime/extensions.cpp @@ -27,10 +27,7 @@ #include #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#define NOMINMAX -#include - +#include "flang/Common/windows-include.h" #include inline void CtimeBuffer(char *buffer, size_t bufsize, const time_t cur_time, @@ -309,6 +306,9 @@ void RTNAME(Perror)(const char *str) { perror(str); } // GNU extension function TIME() std::int64_t RTNAME(time)() { return time(nullptr); } +// MCLOCK: returns accumulated CPU time in ticks +std::int32_t FORTRAN_PROCEDURE_NAME(mclock)() { return std::clock(); } + // Extension procedures related to I/O namespace io { diff --git a/flang-rt/lib/runtime/non-tbp-dio.cpp b/flang-rt/lib/runtime/non-tbp-dio.cpp index 72101b06e0c6e..d516526033c27 100644 --- a/flang-rt/lib/runtime/non-tbp-dio.cpp +++ b/flang-rt/lib/runtime/non-tbp-dio.cpp @@ -17,7 +17,7 @@ const NonTbpDefinedIo *NonTbpDefinedIoTable::Find( for (const auto *p{item}; j-- > 0; ++p) { if (&p->derivedType == &type && p->definedIo == definedIo) { return p; - } else if (p->isDtvArgPolymorphic) { + } else if (p->flags & IsDtvArgPolymorphic) { for (const typeInfo::DerivedType *t{type.GetParentType()}; t; t = t->GetParentType()) { if (&p->derivedType == t && p->definedIo == definedIo) { diff --git a/flang-rt/lib/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp index 04487abd3272e..68db2594acdd4 100644 --- a/flang-rt/lib/runtime/pointer.cpp +++ b/flang-rt/lib/runtime/pointer.cpp @@ -87,9 +87,9 @@ void RTDEF(PointerAssociateLowerBounds)(Descriptor &pointer, } } -void RTDEF(PointerAssociateRemapping)(Descriptor &pointer, +static void RT_API_ATTRS PointerRemapping(Descriptor &pointer, const Descriptor &target, const Descriptor &bounds, const char *sourceFile, - int sourceLine) { + int sourceLine, bool isMonomorphic) { Terminator terminator{sourceFile, sourceLine}; SubscriptValue byteStride{/*captured from first dimension*/}; std::size_t boundElementBytes{bounds.ElementBytes()}; @@ -99,7 +99,7 @@ void RTDEF(PointerAssociateRemapping)(Descriptor &pointer, // the ranks may mismatch. Use target as a mold for initializing // the pointer descriptor. 
INTERNAL_CHECK(static_cast(pointer.rank()) == boundsRank); - pointer.ApplyMold(target, boundsRank); + pointer.ApplyMold(target, boundsRank, isMonomorphic); pointer.set_base_addr(target.raw().base_addr); pointer.raw().attribute = CFI_attribute_pointer; for (unsigned j{0}; j < boundsRank; ++j) { @@ -124,6 +124,19 @@ void RTDEF(PointerAssociateRemapping)(Descriptor &pointer, } } +void RTDEF(PointerAssociateRemapping)(Descriptor &pointer, + const Descriptor &target, const Descriptor &bounds, const char *sourceFile, + int sourceLine) { + PointerRemapping( + pointer, target, bounds, sourceFile, sourceLine, /*isMonomorphic=*/false); +} +void RTDEF(PointerAssociateRemappingMonomorphic)(Descriptor &pointer, + const Descriptor &target, const Descriptor &bounds, const char *sourceFile, + int sourceLine) { + PointerRemapping( + pointer, target, bounds, sourceFile, sourceLine, /*isMonomorphic=*/true); +} + RT_API_ATTRS void *AllocateValidatedPointerPayload( std::size_t byteSize, int allocatorIdx) { // Add space for a footer to validate during deallocation. diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp index 3e1d7c9c3c788..50123f4cf321c 100644 --- a/flang-rt/lib/runtime/type-info.cpp +++ b/flang-rt/lib/runtime/type-info.cpp @@ -330,7 +330,7 @@ FILE *SpecialBinding::Dump(FILE *f) const { } std::fprintf(f, " isArgDescriptorSet: 0x%x\n", isArgDescriptorSet_); std::fprintf(f, " isTypeBound: %d\n", isTypeBound_); - std::fprintf(f, " isArgContiguousSet: 0x%x\n", isArgContiguousSet_); + std::fprintf(f, " specialCaseFlag 0x%x\n", specialCaseFlag_); std::fprintf(f, " proc: %p\n", reinterpret_cast(proc_)); return f; } diff --git a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp index 83aa37f8d06f3..d76fca2c4250e 100644 --- a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp +++ b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp @@ -79,6 +79,6 @@ TEST(AllocatableCUFTest, CUFSetAllocatorIndex) { // REAL(4), DEVICE, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Real, 4)}; EXPECT_EQ((int)kDefaultAllocator, a->GetAllocIdx()); - RTNAME(CUFSetAllocatorIndex)(*a, kDeviceAllocatorPos, __FILE__, __LINE__); + RTNAME(CUFSetAllocatorIndex)(a, kDeviceAllocatorPos, __FILE__, __LINE__); EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); } diff --git a/flang-rt/unittests/Runtime/NumericalFormatTest.cpp b/flang-rt/unittests/Runtime/NumericalFormatTest.cpp index f1492d0e39fec..73245dca13bc0 100644 --- a/flang-rt/unittests/Runtime/NumericalFormatTest.cpp +++ b/flang-rt/unittests/Runtime/NumericalFormatTest.cpp @@ -213,6 +213,37 @@ TEST(IOApiTests, ListInputTest) { << "', but got '" << output << "'"; } +TEST(IOApiTests, ListInputComplexRegressionTest) { + static const char input[]{"(1,;2, );(3,;4,)"}; + auto cookie{IONAME(BeginInternalListInput)(input, sizeof input - 1)}; + static constexpr int numRealValues{4}; + float z[numRealValues]; + ASSERT_TRUE(IONAME(SetDecimal)(cookie, "COMMA", 5)); + for (int j{0}; j < numRealValues; j += 2) { + ASSERT_TRUE(IONAME(InputComplex32)(cookie, &z[j])) + << "InputComplex32 failed with value " << z[j]; + } + auto status{IONAME(EndIoStatement)(cookie)}; + ASSERT_EQ(status, 0) << "Failed complex list-directed input, status " + << static_cast(status); + static constexpr int bufferSize{18}; + char output[bufferSize]; + output[bufferSize - 1] = '\0'; + cookie = IONAME(BeginInternalListOutput)(output, bufferSize - 1); + for (int j{0}; j < numRealValues; j += 2) { + 
ASSERT_TRUE(IONAME(OutputComplex32)(cookie, z[j], z[j + 1])) + << "OutputComplex32 failed when outputting value " << z[j] << ", " + << z[j + 1]; + } + status = IONAME(EndIoStatement)(cookie); + ASSERT_EQ(status, 0) << "Failed complex list-directed output, status " + << static_cast(status); + static const char expect[bufferSize]{" (1.,2.) (3.,4.) "}; + ASSERT_EQ(std::strncmp(output, expect, bufferSize), 0) + << "Failed complex list-directed output, expected '" << expect + << "', but got '" << output << "'"; +} + TEST(IOApiTests, DescriptorOutputTest) { static constexpr int bufferSize{10}; char buffer[bufferSize]; diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 72d12cd92600d..c167a55bc486d 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -1,9 +1,9 @@ - # Fortran Extensions supported by Flang @@ -170,6 +170,18 @@ end In the case of `DEFERRED` bindings in an `ABSTRACT` derived type, however, overrides are necessary, so they are permitted for inaccessible bindings with an optional warning. +* Main program name is allowed to be the same as the other symbols used + in the main program, for example: +``` +module m +end +program m +use m +end +``` + Note that internally the main program symbol name is all uppercase, unlike + the names of all other symbols, which are usually all lowercase. This + may make a difference in testing/debugging. ## Extensions, deletions, and legacy features supported by default diff --git a/flang/docs/GettingStarted.md b/flang/docs/GettingStarted.md index 0b3b551ffbfba..2ea8093b607cf 100644 --- a/flang/docs/GettingStarted.md +++ b/flang/docs/GettingStarted.md @@ -74,15 +74,14 @@ cmake \ -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX=$INSTALLDIR \ - -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_CXX_LINK_FLAGS="-Wl,-rpath,$LD_LIBRARY_PATH" \ -DFLANG_ENABLE_WERROR=ON \ -DLLVM_ENABLE_ASSERTIONS=ON \ -DLLVM_TARGETS_TO_BUILD=host \ -DLLVM_LIT_ARGS=-v \ - -DLLVM_ENABLE_PROJECTS="clang;mlir;flang;openmp" \ - -DLLVM_ENABLE_RUNTIMES="compiler-rt;flang-rt" \ + -DLLVM_ENABLE_PROJECTS="clang;mlir;flang" \ + -DLLVM_ENABLE_RUNTIMES="compiler-rt;flang-rt;openmp" \ ../llvm-project/llvm ninja @@ -141,7 +140,6 @@ cd build cmake \ -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_LINK_FLAGS="-Wl,-rpath,$LD_LIBRARY_PATH" \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DFLANG_ENABLE_WERROR=ON \ diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 0118f8eb7d913..f7da6c889d413 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -709,8 +709,9 @@ CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, LOC MALLOC, FREE ``` -### Library subroutine +### Library subroutines and functions ``` +ticks = MCLOCK() CALL BACKTRACE() CALL FDATE(TIME) CALL GETLOG(USRNAME) diff --git a/flang/docs/ParallelMultiImageFortranRuntime.md b/flang/docs/ParallelMultiImageFortranRuntime.md new file mode 100644 index 0000000000000..8cf0055e5817b --- /dev/null +++ b/flang/docs/ParallelMultiImageFortranRuntime.md @@ -0,0 +1,18 @@ + + +# Multi-Image Parallel Fortran Runtime + + +The Parallel Runtime Interface for Fortran (PRIF) defines an +interface designed for LLVM Flang to target implementations of +Fortran's multi-image parallel features. 
+ +The current revision of the PRIF specification is here: + + diff --git a/flang/docs/index.md b/flang/docs/index.md index 2568ad70c5d09..016577bcb1e98 100644 --- a/flang/docs/index.md +++ b/flang/docs/index.md @@ -78,6 +78,7 @@ on how to get in touch with us and to learn more about the current status. OpenMP-semantics OptionComparison Overview + ParallelMultiImageFortranRuntime ParameterizedDerivedTypes ParserCombinators Parsing diff --git a/flang/include/flang/Evaluate/integer.h b/flang/include/flang/Evaluate/integer.h index fccc2ad774a8f..5953fc81cb111 100644 --- a/flang/include/flang/Evaluate/integer.h +++ b/flang/include/flang/Evaluate/integer.h @@ -74,6 +74,7 @@ class Integer { static_assert(std::is_unsigned_v); static_assert(CHAR_BIT * sizeof(BigPart) >= 2 * partBits); static constexpr bool littleEndian{IS_LITTLE_ENDIAN}; + static constexpr int alignment{ALIGNMENT}; private: static constexpr int maxPartBits{CHAR_BIT * sizeof(Part)}; diff --git a/flang/include/flang/Evaluate/real.h b/flang/include/flang/Evaluate/real.h index 03294881850a1..76d25d9fe2670 100644 --- a/flang/include/flang/Evaluate/real.h +++ b/flang/include/flang/Evaluate/real.h @@ -490,7 +490,10 @@ template class Real { bool isNegative, int exponent, const Fraction &, Rounding, RoundingBits, bool multiply = false); - Word word_{}; // an Integer<> + // Require alignment, in case code generation on x86_64 decides that our + // Real object is suitable for SSE2 instructions and then gets surprised + // by unaligned address. + alignas(Word::alignment / 8) Word word_{}; // an Integer<> }; extern template class Real, 11>; // IEEE half format diff --git a/flang/include/flang/Lower/Runtime.h b/flang/include/flang/Lower/Runtime.h index 77e98a1e019e7..f76f398569b54 100644 --- a/flang/include/flang/Lower/Runtime.h +++ b/flang/include/flang/Lower/Runtime.h @@ -70,7 +70,7 @@ void genPointerAssociate(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target); void genPointerAssociateRemapping(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target, - mlir::Value bounds); + mlir::Value bounds, bool isMonomorphic); void genPointerAssociateLowerBounds(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target, mlir::Value lbounds); diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index d38c5b6d09a82..d84d3593ebca6 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -241,11 +241,11 @@ struct IntrinsicLibrary { void genCFProcPointer(llvm::ArrayRef); fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genCLoc(mlir::Type, llvm::ArrayRef); - mlir::Value genClock64(mlir::Type, llvm::ArrayRef); template fir::ExtendedValue genCPtrCompare(mlir::Type, llvm::ArrayRef); mlir::Value genCosd(mlir::Type, llvm::ArrayRef); + mlir::Value genCospi(mlir::Type, llvm::ArrayRef); void genDateAndTime(llvm::ArrayRef); mlir::Value genDim(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genDotProduct(mlir::Type, @@ -376,6 +376,8 @@ struct IntrinsicLibrary { fir::ExtendedValue genNorm2(mlir::Type, llvm::ArrayRef); mlir::Value genNot(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef); + template + mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef); void genPerror(llvm::ArrayRef); @@ 
-417,6 +419,7 @@ struct IntrinsicLibrary { mlir::Value genShiftA(mlir::Type resultType, llvm::ArrayRef); mlir::Value genSign(mlir::Type, llvm::ArrayRef); mlir::Value genSind(mlir::Type, llvm::ArrayRef); + mlir::Value genSinpi(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genSize(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genSizeOf(mlir::Type, llvm::ArrayRef); mlir::Value genSpacing(mlir::Type resultType, diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h index 9ca4b2baeaa65..145ea04e56484 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h @@ -37,7 +37,7 @@ void genPointerAssociate(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target); void genPointerAssociateRemapping(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target, - mlir::Value bounds); + mlir::Value bounds, bool isMonomorphic); mlir::Value genCpuTime(fir::FirOpBuilder &, mlir::Location); void genDateAndTime(fir::FirOpBuilder &, mlir::Location, diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h index 8bb4791859bb7..aacba233a2b32 100644 --- a/flang/include/flang/Optimizer/Support/InitFIR.h +++ b/flang/include/flang/Optimizer/Support/InitFIR.h @@ -23,6 +23,7 @@ #include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/Func/Extensions/InlinerExtension.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/OpenACC/Transforms/Passes.h" #include "mlir/Dialect/SCF/Transforms/Passes.h" @@ -41,7 +42,8 @@ namespace fir::support { mlir::cf::ControlFlowDialect, mlir::func::FuncDialect, \ mlir::vector::VectorDialect, mlir::math::MathDialect, \ mlir::complex::ComplexDialect, mlir::DLTIDialect, cuf::CUFDialect, \ - mlir::NVVM::NVVMDialect, mlir::gpu::GPUDialect + mlir::NVVM::NVVMDialect, mlir::gpu::GPUDialect, \ + mlir::index::IndexDialect #define FLANG_CODEGEN_DIALECT_LIST FIRCodeGenDialect, mlir::LLVM::LLVMDialect diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 32b6ca45609b6..23e35d106c077 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -529,6 +529,8 @@ class ParseTreeDumper { NODE(parser, OmpAlignClause) NODE(parser, OmpAlignedClause) NODE(OmpAlignedClause, Modifier) + NODE(parser, OmpAlwaysModifier) + NODE_ENUM(OmpAlwaysModifier, Value) NODE(parser, OmpAtClause) NODE_ENUM(OmpAtClause, ActionTime) NODE_ENUM(OmpSeverityClause, Severity) @@ -546,6 +548,8 @@ class ParseTreeDumper { #include "llvm/Frontend/OpenMP/OMP.inc" NODE(parser, OmpClauseList) NODE(parser, OmpCancellationConstructTypeClause) + NODE(parser, OmpCloseModifier) + NODE_ENUM(OmpCloseModifier, Value) NODE(parser, OmpContainsClause) NODE(parser, OmpCriticalDirective) NODE(parser, OmpErrorDirective) @@ -561,6 +565,8 @@ class ParseTreeDumper { NODE(parser, OmpDefaultmapClause) NODE_ENUM(OmpDefaultmapClause, ImplicitBehavior) NODE(OmpDefaultmapClause, Modifier) + NODE(parser, OmpDeleteModifier) + NODE_ENUM(OmpDeleteModifier, Value) NODE(parser, OmpDependenceType) NODE_ENUM(OmpDependenceType, Value) NODE(parser, OmpTaskDependenceType) @@ -628,6 +634,8 @@ class ParseTreeDumper { NODE(OmpNumTasksClause, Modifier) NODE(parser, OmpBindClause) NODE_ENUM(OmpBindClause, Binding) 
+ NODE(parser, OmpPresentModifier) + NODE_ENUM(OmpPresentModifier, Value) NODE(parser, OmpProcBindClause) NODE_ENUM(OmpProcBindClause, AffinityPolicy) NODE(parser, OmpReductionModifier) @@ -637,6 +645,10 @@ class ParseTreeDumper { NODE(parser, OmpInReductionClause) NODE(OmpInReductionClause, Modifier) NODE(parser, OmpReductionCombiner) + NODE(parser, OmpRefModifier) + NODE_ENUM(OmpRefModifier, Value) + NODE(parser, OmpSelfModifier) + NODE_ENUM(OmpSelfModifier, Value) NODE(parser, OmpTaskReductionClause) NODE(OmpTaskReductionClause, Modifier) NODE(parser, OmpInitializerProc) @@ -673,6 +685,8 @@ class ParseTreeDumper { NODE(parser, OmpSectionsDirective) NODE(parser, OmpToClause) NODE(OmpToClause, Modifier) + NODE(parser, OmpxHoldModifier) + NODE_ENUM(OmpxHoldModifier, Value) NODE(parser, Only) NODE(parser, OpenACCAtomicConstruct) NODE(parser, OpenACCBlockConstruct) diff --git a/flang/include/flang/Parser/message.h b/flang/include/flang/Parser/message.h index db1a0a65157e3..9192d23529913 100644 --- a/flang/include/flang/Parser/message.h +++ b/flang/include/flang/Parser/message.h @@ -355,9 +355,9 @@ class Messages { void Emit(llvm::raw_ostream &, const AllCookedSources &, bool echoSourceLines = true, const common::LanguageFeatureControl *hintFlags = nullptr, - std::size_t maxErrorsToEmit = 0) const; + std::size_t maxErrorsToEmit = 0, bool warningsAreErrors = false) const; void AttachTo(Message &, std::optional = std::nullopt); - bool AnyFatalError() const; + bool AnyFatalError(bool warningsAreErrors = false) const; private: std::list messages_; diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index cc1d032f94d4a..0b3dec1010312 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -3756,6 +3756,19 @@ struct OmpAllocatorComplexModifier { WRAPPER_CLASS_BOILERPLATE(OmpAllocatorComplexModifier, ScalarIntExpr); }; +// Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158], +// [6.0:279-288] +// +// always-modifier -> +// ALWAYS // since 4.5 +// +// Until 5.2, it was a part of map-type-modifier. Since 6.0 the +// map-type-modifier has been split into individual modifiers. +struct OmpAlwaysModifier { + ENUM_CLASS(Value, Always) + WRAPPER_CLASS_BOILERPLATE(OmpAlwaysModifier, Value); +}; + // Ref: [5.2:252-254] // // chunk-modifier -> @@ -3767,17 +3780,29 @@ struct OmpChunkModifier { WRAPPER_CLASS_BOILERPLATE(OmpChunkModifier, Value); }; -// Ref: [5.0:47-49], [5.1:49-51], [5.2:67-69] +// Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158], +// [6.0:279-288] // -// iterator-specifier -> -// [iterator-type] iterator-identifier -// = range-specification | // since 5.0 -// [iterator-type ::] iterator-identifier -// = range-specification // since 5.2 -struct OmpIteratorSpecifier { - TUPLE_CLASS_BOILERPLATE(OmpIteratorSpecifier); - CharBlock source; - std::tuple t; +// close-modifier -> +// CLOSE // since 5.0 +// +// Until 5.2, it was a part of map-type-modifier. Since 6.0 the +// map-type-modifier has been split into individual modifiers. +struct OmpCloseModifier { + ENUM_CLASS(Value, Close) + WRAPPER_CLASS_BOILERPLATE(OmpCloseModifier, Value); +}; + +// Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158], +// [6.0:279-288] +// +// delete-modifier -> +// DELETE // since 6.0 +// +// Until 5.2, it was a part of map-type. 
+struct OmpDeleteModifier { + ENUM_CLASS(Value, Delete) + WRAPPER_CLASS_BOILERPLATE(OmpDeleteModifier, Value); }; // Ref: [4.5:169-170], [5.0:255-256], [5.1:288-289] @@ -3867,6 +3892,19 @@ struct OmpInteropType { WRAPPER_CLASS_BOILERPLATE(OmpInteropType, Value); }; +// Ref: [5.0:47-49], [5.1:49-51], [5.2:67-69] +// +// iterator-specifier -> +// [iterator-type] iterator-identifier +// = range-specification | // since 5.0 +// [iterator-type ::] iterator-identifier +// = range-specification // since 5.2 +struct OmpIteratorSpecifier { + TUPLE_CLASS_BOILERPLATE(OmpIteratorSpecifier); + CharBlock source; + std::tuple t; +}; + // Ref: [5.0:47-49], [5.1:49-51], [5.2:67-69] // // iterator-modifier -> @@ -3901,21 +3939,28 @@ struct OmpMapper { WRAPPER_CLASS_BOILERPLATE(OmpMapper, Name); }; -// Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158] +// Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158], +// [6.0:279-288] // // map-type -> -// ALLOC | DELETE | FROM | RELEASE | TO | TOFROM // since 4.5 +// ALLOC | DELETE | RELEASE | // since 4.5, until 5.2 +// FROM | TO | TOFROM | // since 4.5 +// STORAGE // since 6.0 +// +// Since 6.0 DELETE is a separate delete-modifier. struct OmpMapType { - ENUM_CLASS(Value, Alloc, Delete, From, Release, To, Tofrom); + ENUM_CLASS(Value, Alloc, Delete, From, Release, Storage, To, Tofrom); WRAPPER_CLASS_BOILERPLATE(OmpMapType, Value); }; // Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158] // // map-type-modifier -> -// ALWAYS | // since 4.5 -// CLOSE | // since 5.0 -// PRESENT // since 5.1 +// ALWAYS | // since 4.5, until 5.2 +// CLOSE | // since 5.0, until 5.2 +// PRESENT // since 5.1, until 5.2 +// Since 6.0 the map-type-modifier has been split into individual modifiers. +// struct OmpMapTypeModifier { ENUM_CLASS(Value, Always, Close, Present, Ompx_Hold) WRAPPER_CLASS_BOILERPLATE(OmpMapTypeModifier, Value); @@ -3954,6 +3999,19 @@ struct OmpPrescriptiveness { WRAPPER_CLASS_BOILERPLATE(OmpPrescriptiveness, Value); }; +// Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158], +// [6.0:279-288] +// +// present-modifier -> +// PRESENT // since 5.1 +// +// Until 5.2, it was a part of map-type-modifier. Since 6.0 the +// map-type-modifier has been split into individual modifiers. +struct OmpPresentModifier { + ENUM_CLASS(Value, Present) + WRAPPER_CLASS_BOILERPLATE(OmpPresentModifier, Value); +}; + // Ref: [5.0:300-302], [5.1:332-334], [5.2:134-137] // // reduction-modifier -> @@ -3963,6 +4021,26 @@ struct OmpReductionModifier { WRAPPER_CLASS_BOILERPLATE(OmpReductionModifier, Value); }; +// Ref: [6.0:279-288] +// +// ref-modifier -> +// REF_PTEE | REF_PTR | REF_PTR_PTEE // since 6.0 +// +struct OmpRefModifier { + ENUM_CLASS(Value, Ref_Ptee, Ref_Ptr, Ref_Ptr_Ptee) + WRAPPER_CLASS_BOILERPLATE(OmpRefModifier, Value); +}; + +// Ref: [6.0:279-288] +// +// self-modifier -> +// SELF // since 6.0 +// +struct OmpSelfModifier { + ENUM_CLASS(Value, Self) + WRAPPER_CLASS_BOILERPLATE(OmpSelfModifier, Value); +}; + // Ref: [5.2:117-120] // // step-complex-modifier -> @@ -4001,6 +4079,19 @@ struct OmpVariableCategory { WRAPPER_CLASS_BOILERPLATE(OmpVariableCategory, Value); }; +// Extension: +// https://openmp.llvm.org//openacc/OpenMPExtensions.html#ompx-hold +// +// ompx-hold-modifier -> +// OMPX_HOLD // since 4.5 +// +// Until 5.2, it was a part of map-type-modifier. Since 6.0 the +// map-type-modifier has been split into individual modifiers. 
+struct OmpxHoldModifier { + ENUM_CLASS(Value, Ompx_Hold) + WRAPPER_CLASS_BOILERPLATE(OmpxHoldModifier, Value); +}; + // context-selector using OmpContextSelector = traits::OmpContextSelectorSpecification; } // namespace modifier @@ -4376,13 +4467,25 @@ struct OmpLinearClause { // map-clause -> // MAP([modifier...:] locator-list) // since 4.5 // modifier -> -// map-type-modifier | // since 4.5 +// map-type-modifier [replaced] | // since 4.5, until 5.2 +// always-modifier | // since 6.0 +// close-modifier | // since 6.0 +// delete-modifier | // since 6.0 +// present-modifier | // since 6.0 +// ref-modifier | // since 6.0 +// self-modifier | // since 6.0 // mapper | // since 5.0 // iterator | // since 5.1 // map-type // since 4.5 +// ompx-hold-modifier | // since 6.0 +// +// Since 6.0 the map-type-modifier has been split into individual modifiers, +// and delete-modifier has been split from map-type. struct OmpMapClause { TUPLE_CLASS_BOILERPLATE(OmpMapClause); - MODIFIER_BOILERPLATE(OmpMapTypeModifier, OmpMapper, OmpIterator, OmpMapType); + MODIFIER_BOILERPLATE(OmpAlwaysModifier, OmpCloseModifier, OmpDeleteModifier, + OmpMapTypeModifier, OmpPresentModifier, OmpRefModifier, OmpSelfModifier, + OmpMapper, OmpIterator, OmpMapType, OmpxHoldModifier); std::tuple t; }; diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index 06ae7f35d9b5b..b350204714431 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -12,14 +12,12 @@ #ifndef FORTRAN_RUNTIME_EXTENSIONS_H_ #define FORTRAN_RUNTIME_EXTENSIONS_H_ -#include "flang/Runtime/entry-names.h" - -#define FORTRAN_PROCEDURE_NAME(name) name##_ - #include "flang/Runtime/entry-names.h" #include #include +#define FORTRAN_PROCEDURE_NAME(name) name##_ + #ifdef _WIN32 // UID and GID don't exist on Windows, these exist to avoid errors. typedef std::uint32_t uid_t; @@ -89,5 +87,8 @@ int FORTRAN_PROCEDURE_NAME(ierrno)(); // GNU extension subroutine PERROR(STRING) void RTNAME(Perror)(const char *str); +// MCLOCK -- returns accumulated time in ticks +int FORTRAN_PROCEDURE_NAME(mclock)(); + } // extern "C" #endif // FORTRAN_RUNTIME_EXTENSIONS_H_ diff --git a/flang/include/flang/Runtime/pointer.h b/flang/include/flang/Runtime/pointer.h index 83472ee59d2ab..6787ef3ece232 100644 --- a/flang/include/flang/Runtime/pointer.h +++ b/flang/include/flang/Runtime/pointer.h @@ -59,9 +59,14 @@ void RTDECL(PointerAssociateLowerBounds)( // Associates a pointer with a target with bounds remapping. The target must be // simply contiguous &/or of rank 1. The bounds constitute a [2,newRank] // integer array whose columns are [lower bound, upper bound] on each dimension. +// Use the Monomorphic form if the pointer's type shouldn't change and +// the target is polymorphic. 
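The parse-tree.h hunks above replace the single map-type-modifier with one wrapper class per keyword (always, close, delete, present, ref, self, ompx_hold) and add STORAGE to map-type. A minimal standalone sketch of the resulting shape of a map clause's modifier list, using std::variant stand-ins rather than flang's real Modifier machinery (all type names below are illustrative only):

#include <variant>
#include <vector>

// Illustrative stand-ins for the wrapper classes added above; each OpenMP 6.0
// modifier keyword becomes its own alternative in the modifier list.
struct Always {};
struct Close {};
struct Present {};
enum class MapType { Alloc, Delete, From, Release, Storage, To, Tofrom };
using MapModifier = std::variant<Always, Close, Present, MapType>;

// map(always, close, to: x) is carried as two distinct modifiers plus the
// map-type, instead of repeated OmpMapTypeModifier values.
const std::vector<MapModifier> exampleModifiers{Always{}, Close{}, MapType::To};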
void RTDECL(PointerAssociateRemapping)(Descriptor &, const Descriptor &target, const Descriptor &bounds, const char *sourceFile = nullptr, int sourceLine = 0); +void RTDECL(PointerAssociateRemappingMonomorphic)(Descriptor &, + const Descriptor &target, const Descriptor &bounds, + const char *sourceFile = nullptr, int sourceLine = 0); // Data pointer allocation and deallocation diff --git a/flang/include/flang/Semantics/openmp-modifiers.h b/flang/include/flang/Semantics/openmp-modifiers.h index 4fbd80f989e72..a9fe911ef8807 100644 --- a/flang/include/flang/Semantics/openmp-modifiers.h +++ b/flang/include/flang/Semantics/openmp-modifiers.h @@ -71,8 +71,11 @@ DECLARE_DESCRIPTOR(parser::OmpAlignment); DECLARE_DESCRIPTOR(parser::OmpAlignModifier); DECLARE_DESCRIPTOR(parser::OmpAllocatorComplexModifier); DECLARE_DESCRIPTOR(parser::OmpAllocatorSimpleModifier); +DECLARE_DESCRIPTOR(parser::OmpAlwaysModifier); DECLARE_DESCRIPTOR(parser::OmpChunkModifier); +DECLARE_DESCRIPTOR(parser::OmpCloseModifier); DECLARE_DESCRIPTOR(parser::OmpContextSelector); +DECLARE_DESCRIPTOR(parser::OmpDeleteModifier); DECLARE_DESCRIPTOR(parser::OmpDependenceType); DECLARE_DESCRIPTOR(parser::OmpDeviceModifier); DECLARE_DESCRIPTOR(parser::OmpDirectiveNameModifier); @@ -88,12 +91,16 @@ DECLARE_DESCRIPTOR(parser::OmpMapTypeModifier); DECLARE_DESCRIPTOR(parser::OmpOrderModifier); DECLARE_DESCRIPTOR(parser::OmpOrderingModifier); DECLARE_DESCRIPTOR(parser::OmpPrescriptiveness); +DECLARE_DESCRIPTOR(parser::OmpPresentModifier); DECLARE_DESCRIPTOR(parser::OmpReductionIdentifier); DECLARE_DESCRIPTOR(parser::OmpReductionModifier); +DECLARE_DESCRIPTOR(parser::OmpRefModifier); +DECLARE_DESCRIPTOR(parser::OmpSelfModifier); DECLARE_DESCRIPTOR(parser::OmpStepComplexModifier); DECLARE_DESCRIPTOR(parser::OmpStepSimpleModifier); DECLARE_DESCRIPTOR(parser::OmpTaskDependenceType); DECLARE_DESCRIPTOR(parser::OmpVariableCategory); +DECLARE_DESCRIPTOR(parser::OmpxHoldModifier); #undef DECLARE_DESCRIPTOR diff --git a/flang/include/flang/Semantics/runtime-type-info.h b/flang/include/flang/Semantics/runtime-type-info.h index 6c5a061d1c1a2..94e8293b14643 100644 --- a/flang/include/flang/Semantics/runtime-type-info.h +++ b/flang/include/flang/Semantics/runtime-type-info.h @@ -52,10 +52,15 @@ constexpr char procCompName[]{"proc"}; SymbolVector CollectBindings(const Scope &dtScope); +enum NonTbpDefinedIoFlags { + IsDtvArgPolymorphic = 1 << 0, + DefinedIoInteger8 = 1 << 1, +}; + struct NonTbpDefinedIo { const Symbol *subroutine; common::DefinedIo definedIo; - bool isDtvArgPolymorphic; + std::uint8_t flags; }; std::multimap diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index b319e2c7e5e74..5bde9f39ca0b0 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -812,9 +812,8 @@ class Symbol { // OpenMP data-sharing attribute OmpShared, OmpPrivate, OmpLinear, OmpFirstPrivate, OmpLastPrivate, // OpenMP data-mapping attribute - OmpMapTo, OmpMapFrom, OmpMapToFrom, OmpMapAlloc, OmpMapRelease, - OmpMapDelete, OmpUseDevicePtr, OmpUseDeviceAddr, OmpIsDevicePtr, - OmpHasDeviceAddr, + OmpMapTo, OmpMapFrom, OmpMapToFrom, OmpMapStorage, OmpMapDelete, + OmpUseDevicePtr, OmpUseDeviceAddr, OmpIsDevicePtr, OmpHasDeviceAddr, // OpenMP data-copying attribute OmpCopyIn, OmpCopyPrivate, // OpenMP miscellaneous flags diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 4773e136c41cb..d44239b41fa20 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ 
b/flang/lib/Evaluate/intrinsics.cpp @@ -428,6 +428,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"conjg", {{"z", SameComplex}}, SameComplex}, {"cos", {{"x", SameFloating}}, SameFloating}, {"cosd", {{"x", SameFloating}}, SameFloating}, + {"cospi", {{"x", SameFloating}}, SameFloating}, {"cosh", {{"x", SameFloating}}, SameFloating}, {"coshape", {{"coarray", AnyData, Rank::coarray}, SizeDefaultKIND}, KINDInt, Rank::vector, IntrinsicClass::inquiryFunction}, @@ -956,6 +957,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"sin", {{"x", SameFloating}}, SameFloating}, {"sind", {{"x", SameFloating}}, SameFloating}, {"sinh", {{"x", SameFloating}}, SameFloating}, + {"sinpi", {{"x", SameFloating}}, SameFloating}, {"size", {{"array", AnyData, Rank::arrayOrAssumedRank}, OptionalDIM, // unless array is assumed-size diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp index 2429e07e5b8c4..58901c6000380 100644 --- a/flang/lib/Frontend/FrontendAction.cpp +++ b/flang/lib/Frontend/FrontendAction.cpp @@ -230,15 +230,14 @@ bool FrontendAction::reportFatalErrors(const char (&message)[N]) { const common::LanguageFeatureControl &features{ instance->getInvocation().getFortranOpts().features}; const size_t maxErrors{instance->getInvocation().getMaxErrors()}; - if (!instance->getParsing().messages().empty() && - (instance->getInvocation().getWarnAsErr() || - instance->getParsing().messages().AnyFatalError())) { + const bool warningsAreErrors{instance->getInvocation().getWarnAsErr()}; + if (instance->getParsing().messages().AnyFatalError(warningsAreErrors)) { const unsigned diagID = instance->getDiagnostics().getCustomDiagID( clang::DiagnosticsEngine::Error, message); instance->getDiagnostics().Report(diagID) << getCurrentFileOrBufferName(); instance->getParsing().messages().Emit( llvm::errs(), instance->getAllCookedSources(), - /*echoSourceLines=*/true, &features, maxErrors); + /*echoSourceLines=*/true, &features, maxErrors, warningsAreErrors); return true; } if (instance->getParsing().parseTree().has_value() && @@ -249,7 +248,7 @@ bool FrontendAction::reportFatalErrors(const char (&message)[N]) { instance->getDiagnostics().Report(diagID) << getCurrentFileOrBufferName(); instance->getParsing().messages().Emit( llvm::errs(), instance->getAllCookedSources(), - /*echoSourceLine=*/true, &features, maxErrors); + /*echoSourceLine=*/true, &features, maxErrors, warningsAreErrors); instance->getParsing().EmitMessage( llvm::errs(), instance->getParsing().finalRestingPlace(), "parser FAIL (final position)", "error: ", llvm::raw_ostream::RED); diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index 5536bfe8d63ca..15cd9770b35ba 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -78,8 +78,8 @@ struct ErrorManager { statExpr && errMsgExpr ? 
builder.createBox(loc, converter.genExprAddr(loc, errMsgExpr, stmtCtx)) - : builder.create( - loc, + : fir::AbsentOp::create( + builder, loc, fir::BoxType::get(mlir::NoneType::get(builder.getContext()))); sourceFile = fir::factory::locationToFilename(builder, loc); sourceLine = fir::factory::locationToLineNo(builder, loc, @@ -92,10 +92,10 @@ struct ErrorManager { if (statValue) { mlir::Value zero = builder.createIntegerConstant(loc, statValue.getType(), 0); - auto cmp = builder.create( - loc, mlir::arith::CmpIPredicate::eq, statValue, zero); - auto ifOp = builder.create(loc, cmp, - /*withElseRegion=*/false); + auto cmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, statValue, zero); + auto ifOp = fir::IfOp::create(builder, loc, cmp, + /*withElseRegion=*/false); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); } } @@ -106,7 +106,7 @@ struct ErrorManager { assert(stat && "missing stat value"); mlir::Value castStat = builder.createConvert( loc, fir::dyn_cast_ptrEleTy(statAddr.getType()), stat); - builder.create(loc, castStat, statAddr); + fir::StoreOp::create(builder, loc, castStat, statAddr); statValue = stat; } } @@ -141,7 +141,7 @@ static void genRuntimeSetBounds(fir::FirOpBuilder &builder, mlir::Location loc, const auto args = fir::runtime::createArguments( builder, loc, callee.getFunctionType(), box.getAddr(), dimIndex, lowerBound, upperBound); - builder.create(loc, callee, args); + fir::CallOp::create(builder, loc, callee, args); } /// Generate runtime call to set the lengths of a character allocatable or @@ -171,7 +171,7 @@ static void genRuntimeInitCharacter(fir::FirOpBuilder &builder, args.push_back(builder.createIntegerConstant(loc, inputTypes[4], corank)); const auto convertedArgs = fir::runtime::createArguments( builder, loc, callee.getFunctionType(), args); - builder.create(loc, callee, convertedArgs); + fir::CallOp::create(builder, loc, callee, convertedArgs); } /// Generate a sequence of runtime calls to allocate memory. @@ -194,7 +194,7 @@ static mlir::Value genRuntimeAllocate(fir::FirOpBuilder &builder, args.push_back(errorManager.sourceLine); const auto convertedArgs = fir::runtime::createArguments( builder, loc, callee.getFunctionType(), args); - return builder.create(loc, callee, convertedArgs).getResult(0); + return fir::CallOp::create(builder, loc, callee, convertedArgs).getResult(0); } /// Generate a sequence of runtime calls to allocate memory and assign with the @@ -214,7 +214,7 @@ static mlir::Value genRuntimeAllocateSource(fir::FirOpBuilder &builder, builder, loc, callee.getFunctionType(), box.getAddr(), fir::getBase(source), errorManager.hasStat, errorManager.errMsgAddr, errorManager.sourceFile, errorManager.sourceLine); - return builder.create(loc, callee, args).getResult(0); + return fir::CallOp::create(builder, loc, callee, args).getResult(0); } /// Generate runtime call to apply mold to the descriptor. @@ -233,7 +233,7 @@ static void genRuntimeAllocateApplyMold(fir::FirOpBuilder &builder, fir::factory::getMutableIRBox(builder, loc, box), fir::getBase(mold), builder.createIntegerConstant( loc, callee.getFunctionType().getInputs()[2], rank)); - builder.create(loc, callee, args); + fir::CallOp::create(builder, loc, callee, args); } /// Generate a runtime call to deallocate memory. 
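Most of the churn in Allocatable.cpp above (and in the lowering files that follow) is one mechanical rewrite: operations are no longer built through the templated fir::FirOpBuilder::create, but through the op class's static create with the builder as the first argument. A short fragment showing the before/after shape; it is not a standalone program, and loc/addr stand for whatever operands the surrounding call already had:

// Before: the op type is a template argument on the builder.
mlir::Value oldStyle = builder.create<fir::LoadOp>(loc, addr);
// After: static create() on the op class; the builder moves to the front and
// the remaining operands are unchanged.
mlir::Value newStyle = fir::LoadOp::create(builder, loc, addr);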
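Separately, the runtime-type-info.h hunk earlier in this patch replaces NonTbpDefinedIo::isDtvArgPolymorphic with a one-byte flags field. A self-contained sketch of how the two bits combine and are tested; only the enumerator names and values are taken from the patch, the rest is illustrative:

#include <cstdint>
#include <iostream>

// Enumerators copied from the NonTbpDefinedIoFlags addition above.
enum NonTbpDefinedIoFlags : std::uint8_t {
  IsDtvArgPolymorphic = 1 << 0,
  DefinedIoInteger8 = 1 << 1,
};

int main() {
  std::uint8_t flags = IsDtvArgPolymorphic | DefinedIoInteger8;
  std::cout << bool(flags & IsDtvArgPolymorphic) << ' '
            << bool(flags & DefinedIoInteger8) << '\n'; // prints: 1 1
  return 0;
}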
@@ -270,7 +270,7 @@ static mlir::Value genRuntimeDeallocate(fir::FirOpBuilder &builder, errorManager.hasStat, errorManager.errMsgAddr, errorManager.sourceFile, errorManager.sourceLine); } - return builder.create(loc, callee, operands).getResult(0); + return fir::CallOp::create(builder, loc, callee, operands).getResult(0); } //===----------------------------------------------------------------------===// @@ -433,9 +433,9 @@ class AllocateStmtHelper { loc, Fortran::semantics::GetExpr(std::get<1>(shapeSpec.t)), stmtCtx)); ub = builder.createConvert(loc, idxTy, ub); if (lb) { - mlir::Value diff = builder.create(loc, ub, lb); + mlir::Value diff = mlir::arith::SubIOp::create(builder, loc, ub, lb); extents.emplace_back( - builder.create(loc, diff, one)); + mlir::arith::AddIOp::create(builder, loc, diff, one)); } else { extents.emplace_back(ub); } @@ -461,7 +461,7 @@ class AllocateStmtHelper { mlir::Value falseValue = builder.createBool(loc, false); mlir::Value falseConv = builder.createConvert( loc, fir::unwrapRefType(pinned.getType()), falseValue); - builder.create(loc, falseConv, pinned); + fir::StoreOp::create(builder, loc, falseConv, pinned); } void genSimpleAllocation(const Allocation &alloc, @@ -557,7 +557,7 @@ class AllocateStmtHelper { mlir::Value nullPointer = fir::factory::createUnallocatedBox( builder, loc, box.getBoxTy(), box.nonDeferredLenParams(), /*typeSourceBox=*/{}, allocatorIdx); - builder.create(loc, nullPointer, box.getAddr()); + fir::StoreOp::create(builder, loc, nullPointer, box.getAddr()); } else { assert(box.isAllocatable() && "must be an allocatable"); // For allocatables, sync the MutableBoxValue and descriptor before the @@ -597,13 +597,14 @@ class AllocateStmtHelper { assert(sourceBox && "source expression should be lowered to one box"); for (int i = 0; i < sourceExpr->Rank(); ++i) { auto dimVal = builder.createIntegerConstant(loc, idxTy, i); - auto dimInfo = builder.create( - loc, idxTy, idxTy, idxTy, sourceBox->getAddr(), dimVal); + auto dimInfo = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, + sourceBox->getAddr(), dimVal); mlir::Value lb = fir::factory::readLowerBound(builder, loc, sourceExv, i, one); mlir::Value extent = dimInfo.getResult(1); - mlir::Value ub = builder.create( - loc, builder.create(loc, extent, lb), one); + mlir::Value ub = mlir::arith::SubIOp::create( + builder, loc, mlir::arith::AddIOp::create(builder, loc, extent, lb), + one); mlir::Value dimIndex = builder.createIntegerConstant(loc, i32Ty, i); genRuntimeSetBounds(builder, loc, box, dimIndex, lb, ub); } @@ -668,7 +669,7 @@ class AllocateStmtHelper { const auto args = fir::runtime::createArguments( builder, loc, callee.getFunctionType(), box.getAddr(), typeDescAddr, rankValue, corankValue); - builder.create(loc, callee, args); + fir::CallOp::create(builder, loc, callee, args); } /// Generate call to PointerNullifyIntrinsic or AllocatableInitIntrinsic to @@ -697,7 +698,7 @@ class AllocateStmtHelper { const auto args = fir::runtime::createArguments( builder, loc, callee.getFunctionType(), box.getAddr(), categoryValue, kindValue, rankValue, corankValue); - builder.create(loc, callee, args); + fir::CallOp::create(builder, loc, callee, args); } /// Generate call to the AllocatableInitDerived to set up the type descriptor @@ -909,8 +910,8 @@ void Fortran::lower::genDeallocateIfAllocated( .genThen([&]() { if (mlir::Type eleType = box.getEleTy(); mlir::isa(eleType) && box.isPolymorphic()) { - mlir::Value declaredTypeDesc = builder.create( - loc, mlir::TypeAttr::get(eleType)); + mlir::Value 
declaredTypeDesc = fir::TypeDescOp::create( + builder, loc, mlir::TypeAttr::get(eleType)); genDeallocateBox(converter, box, loc, sym, declaredTypeDesc); } else { genDeallocateBox(converter, box, loc, sym); @@ -1151,7 +1152,7 @@ mlir::Value Fortran::lower::getAssumedCharAllocatableOrPointerLen( // here). auto readLength = [&]() { fir::BoxValue boxLoad = - builder.create(loc, fir::getBase(box)).getResult(); + fir::LoadOp::create(builder, loc, fir::getBase(box)).getResult(); return fir::factory::readCharLen(builder, loc, boxLoad); }; if (Fortran::semantics::IsOptional(sym)) { @@ -1160,15 +1161,15 @@ mlir::Value Fortran::lower::getAssumedCharAllocatableOrPointerLen( // they are absents. According to 15.5.2.12 3 (9), it is illegal to // inquire the length of absent optional, even if non deferred, so // it's fine to use undefOp in this case. - auto isPresent = builder.create(loc, builder.getI1Type(), - fir::getBase(box)); + auto isPresent = fir::IsPresentOp::create(builder, loc, builder.getI1Type(), + fir::getBase(box)); mlir::Value len = builder.genIfOp(loc, {idxTy}, isPresent, true) .genThen( - [&]() { builder.create(loc, readLength()); }) + [&]() { fir::ResultOp::create(builder, loc, readLength()); }) .genElse([&]() { - auto undef = builder.create(loc, idxTy); - builder.create(loc, undef.getResult()); + auto undef = fir::UndefOp::create(builder, loc, idxTy); + fir::ResultOp::create(builder, loc, undef.getResult()); }) .getResults()[0]; return len; @@ -1183,5 +1184,5 @@ mlir::Value Fortran::lower::getTypeDescAddr( mlir::Type typeDesc = Fortran::lower::translateDerivedTypeToFIRType(converter, typeSpec); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - return builder.create(loc, mlir::TypeAttr::get(typeDesc)); + return fir::TypeDescOp::create(builder, loc, mlir::TypeAttr::get(typeDesc)); } diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 33c1f1e7a3c3a..b94833d852b2e 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -333,11 +333,12 @@ class TypeInfoConverter { if (details.numPrivatesNotOverridden() > 0) tbpName += "."s + std::to_string(details.numPrivatesNotOverridden()); std::string bindingName = converter.mangleName(details.symbol()); - builder.create( - info.loc, mlir::StringAttr::get(builder.getContext(), tbpName), + fir::DTEntryOp::create( + builder, info.loc, + mlir::StringAttr::get(builder.getContext(), tbpName), mlir::SymbolRefAttr::get(builder.getContext(), bindingName)); } - builder.create(info.loc); + fir::FirEndOp::create(builder, info.loc); } // Gather info about components that is not reflected in fir.type and may be // needed later: component initial values and array component non default @@ -360,11 +361,11 @@ class TypeInfoConverter { componentInfo = builder.createBlock(&dt.getComponentInfo()); auto compName = mlir::StringAttr::get(builder.getContext(), toStringRef(component.name())); - builder.create(info.loc, compName, lbs, init_val); + fir::DTComponentOp::create(builder, info.loc, compName, lbs, init_val); } } if (componentInfo) - builder.create(info.loc); + fir::FirEndOp::create(builder, info.loc); builder.restoreInsertionPoint(insertPointIfCreated); } @@ -810,11 +811,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::ExtendedValue read = fir::factory::genMutableBoxRead( *builder, loc, box, /*mayBePolymorphic=*/false); if (auto read_arr_box = read.getBoxOf()) { - fir::factory::genInlinedAllocation( - *builder, loc, *new_box, read_arr_box->getLBounds(), - read_arr_box->getExtents(), - 
/*lenParams=*/std::nullopt, name, - /*mustBeHeap=*/true); + fir::factory::genInlinedAllocation(*builder, loc, *new_box, + read_arr_box->getLBounds(), + read_arr_box->getExtents(), + /*lenParams=*/{}, name, + /*mustBeHeap=*/true); } else if (auto read_char_arr_box = read.getBoxOf()) { fir::factory::genInlinedAllocation( @@ -825,8 +826,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { } else if (auto read_char_box = read.getBoxOf()) { fir::factory::genInlinedAllocation(*builder, loc, *new_box, - /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, + /*lbounds=*/{}, + /*extents=*/{}, read_char_box->getLen(), name, /*mustBeHeap=*/true); } else { @@ -1466,8 +1467,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { assert(falseTarget && "missing conditional branch false block"); mlir::Location loc = toLocation(); mlir::Value bcc = builder->createConvert(loc, builder->getI1Type(), cond); - builder->create(loc, bcc, trueTarget, std::nullopt, - falseTarget, std::nullopt); + builder->create(loc, bcc, trueTarget, + mlir::ValueRange{}, falseTarget, + mlir::ValueRange{}); } void genConditionalBranch(mlir::Value cond, Fortran::lower::pft::Evaluation *trueTarget, @@ -2556,8 +2558,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->setInsertionPointToEnd(loopWrapperOp.getBody()); auto loopOp = builder->create( loc, nestLBs, nestUBs, nestSts, /*loopAnnotation=*/nullptr, - /*local_vars=*/std::nullopt, - /*local_syms=*/nullptr, /*reduce_vars=*/std::nullopt, + /*local_vars=*/mlir::ValueRange{}, + /*local_syms=*/nullptr, /*reduce_vars=*/mlir::ValueRange{}, /*reduce_byref=*/nullptr, /*reduce_syms=*/nullptr, /*reduce_attrs=*/nullptr); @@ -3810,9 +3812,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Block *selectCaseBlock = insertBlock(blockList[0]); mlir::Block *assumedSizeBlock = rankStarBlock ? rankStarBlock : defaultBlock; - builder->create(loc, isAssumedSize, - assumedSizeBlock, std::nullopt, - selectCaseBlock, std::nullopt); + builder->create( + loc, isAssumedSize, assumedSizeBlock, mlir::ValueRange{}, + selectCaseBlock, mlir::ValueRange{}); startBlock(selectCaseBlock); } // Create fir.select_case for the other rank cases. @@ -4590,8 +4592,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { // the static type of the LHS. if (Fortran::evaluate::UnwrapExpr( assign.rhs)) - return fir::factory::createUnallocatedBox(*builder, loc, lhsBoxType, - std::nullopt); + return fir::factory::createUnallocatedBox(*builder, loc, lhsBoxType, {}); hlfir::Entity rhs = Fortran::lower::convertExprToHLFIR( loc, *this, assign.rhs, localSymbols, rhsContext); // Create pointer descriptor value from the RHS. @@ -4703,8 +4704,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Value lhs = lhsMutableBox.getAddr(); mlir::Value rhs = fir::getBase(genExprBox(loc, assign.rhs, stmtCtx)); mlir::Value boundsDesc = createBoundArray(lbounds, ubounds, loc); - Fortran::lower::genPointerAssociateRemapping(*builder, loc, lhs, rhs, - boundsDesc); + Fortran::lower::genPointerAssociateRemapping( + *builder, loc, lhs, rhs, boundsDesc, + lhsType && rhsType && !lhsType->IsPolymorphic() && + rhsType->IsPolymorphic()); return; } if (!lowerToHighLevelFIR() && explicitIterationSpace()) { @@ -4827,18 +4830,18 @@ class FirConverter : public Fortran::lower::AbstractConverter { base = convertOp.getValue(); // Special case if the rhs is a constant. 
if (matchPattern(base.getDefiningOp(), mlir::m_Constant())) { - builder.create(loc, base, lhsVal, shape, - transferKindAttr); + cuf::DataTransferOp::create(builder, loc, base, lhsVal, shape, + transferKindAttr); } else { auto associate = hlfir::genAssociateExpr( loc, builder, rhs, rhs.getType(), ".cuf_host_tmp"); - builder.create(loc, associate.getBase(), lhsVal, - shape, transferKindAttr); - builder.create(loc, associate); + cuf::DataTransferOp::create(builder, loc, associate.getBase(), lhsVal, + shape, transferKindAttr); + hlfir::EndAssociateOp::create(builder, loc, associate); } } else { - builder.create(loc, rhsVal, lhsVal, shape, - transferKindAttr); + cuf::DataTransferOp::create(builder, loc, rhsVal, lhsVal, shape, + transferKindAttr); } return; } @@ -4847,8 +4850,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { if (!lhsIsDevice && rhsIsDevice) { auto transferKindAttr = cuf::DataTransferKindAttr::get( builder.getContext(), cuf::DataTransferKind::DeviceHost); - builder.create(loc, rhsVal, lhsVal, shape, - transferKindAttr); + cuf::DataTransferOp::create(builder, loc, rhsVal, lhsVal, shape, + transferKindAttr); return; } @@ -4857,8 +4860,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { assert(rhs.isVariable() && "CUDA Fortran assignment rhs is not legal"); auto transferKindAttr = cuf::DataTransferKindAttr::get( builder.getContext(), cuf::DataTransferKind::DeviceDevice); - builder.create(loc, rhsVal, lhsVal, shape, - transferKindAttr); + cuf::DataTransferOp::create(builder, loc, rhsVal, lhsVal, shape, + transferKindAttr); return; } llvm_unreachable("Unhandled CUDA data transfer"); @@ -4904,8 +4907,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { addSymbol(sym, hlfir::translateToExtendedValue(loc, builder, temp).first, /*forced=*/true); - builder.create( - loc, addr, temp, /*shape=*/mlir::Value{}, transferKindAttr); + cuf::DataTransferOp::create(builder, loc, addr, temp, + /*shape=*/mlir::Value{}, + transferKindAttr); ++nbDeviceResidentObject; } } @@ -4994,13 +4998,13 @@ class FirConverter : public Fortran::lower::AbstractConverter { if (isCUDATransfer && !hasCUDAImplicitTransfer) genCUDADataTransfer(builder, loc, assign, lhs, rhs); else - builder.create(loc, rhs, lhs, - isWholeAllocatableAssignment, - keepLhsLengthInAllocatableAssignment); + hlfir::AssignOp::create(builder, loc, rhs, lhs, + isWholeAllocatableAssignment, + keepLhsLengthInAllocatableAssignment); if (hasCUDAImplicitTransfer && !isInDeviceContext) { localSymbols.popScope(); for (mlir::Value temp : implicitTemps) - builder.create(loc, temp); + fir::FreeMemOp::create(builder, loc, temp); } return; } @@ -5008,13 +5012,13 @@ class FirConverter : public Fortran::lower::AbstractConverter { // left-hand side requires using an hlfir.region_assign in HLFIR. The // right-hand side and left-hand side must be evaluated inside the // hlfir.region_assign regions. - auto regionAssignOp = builder.create(loc); + auto regionAssignOp = hlfir::RegionAssignOp::create(builder, loc); // Lower RHS in its own region. builder.createBlock(®ionAssignOp.getRhsRegion()); Fortran::lower::StatementContext rhsContext; hlfir::Entity rhs = evaluateRhs(rhsContext); - auto rhsYieldOp = builder.create(loc, rhs); + auto rhsYieldOp = hlfir::YieldOp::create(builder, loc, rhs); Fortran::lower::genCleanUpInRegionIfAny( loc, builder, rhsYieldOp.getCleanup(), rhsContext); // Lower LHS in its own region. 
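Another recurring cleanup in Bridge.cpp above: empty operand and type lists are spelled with explicit empty ranges instead of std::nullopt, presumably to stay ahead of the deprecated std::nullopt-to-range conversions being removed upstream. A short fragment mirroring the calls above (cond, trueBlock, falseBlock and context are placeholders; this is not a standalone program):

// Empty successor operand lists on a conditional branch.
builder->create<mlir::cf::CondBranchOp>(loc, cond, trueBlock, mlir::ValueRange{},
                                        falseBlock, mlir::ValueRange{});
// Empty input and result lists on a function type.
auto voidFnTy = mlir::FunctionType::get(context, /*inputs=*/{}, /*results=*/{});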
@@ -5023,7 +5027,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Value lhsYield = nullptr; if (!lhsHasVectorSubscripts) { hlfir::Entity lhs = evaluateLhs(lhsContext); - auto lhsYieldOp = builder.create(loc, lhs); + auto lhsYieldOp = hlfir::YieldOp::create(builder, loc, lhs); Fortran::lower::genCleanUpInRegionIfAny( loc, builder, lhsYieldOp.getCleanup(), lhsContext); lhsYield = lhs; @@ -5052,7 +5056,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder.createBlock(®ionAssignOp.getUserDefinedAssignment(), mlir::Region::iterator{}, {rhsType, lhsType}, {loc, loc}); - auto end = builder.create(loc); + auto end = fir::FirEndOp::create(builder, loc); builder.setInsertionPoint(end); hlfir::Entity lhsBlockArg{regionAssignOp.getUserAssignmentLhs()}; hlfir::Entity rhsBlockArg{regionAssignOp.getUserAssignmentRhs()}; @@ -5199,7 +5203,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { "LEN parameters"); lhsRealloc = fir::factory::genReallocIfNeeded( *builder, loc, *lhsMutableBox, - /*shape=*/std::nullopt, lengthParams); + /*shape=*/{}, lengthParams); return lhsRealloc->newValue; } return genExprAddr(assign.lhs, stmtCtx); @@ -5271,7 +5275,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { if (lhsIsWholeAllocatable) { assert(lhsRealloc.has_value()); fir::factory::finalizeRealloc(*builder, loc, *lhsMutableBox, - /*lbounds=*/std::nullopt, + /*lbounds=*/{}, /*takeLboundsIfRealloc=*/false, *lhsRealloc); } @@ -6059,8 +6063,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::func::FuncOp func = fir::FirOpBuilder::createFunction( mlir::UnknownLoc::get(context), getModuleOp(), fir::NameUniquer::doGenerated("Sham"), - mlir::FunctionType::get(context, std::nullopt, std::nullopt), - symbolTable); + mlir::FunctionType::get(context, {}, {}), symbolTable); func.addEntryBlock(); CHECK(!builder && "Expected builder to be uninitialized"); builder = new fir::FirOpBuilder(func, bridge.getKindMap(), symbolTable); diff --git a/flang/lib/Lower/ConvertArrayConstructor.cpp b/flang/lib/Lower/ConvertArrayConstructor.cpp index 7e2142693eac5..55c4b45554f78 100644 --- a/flang/lib/Lower/ConvertArrayConstructor.cpp +++ b/flang/lib/Lower/ConvertArrayConstructor.cpp @@ -137,9 +137,9 @@ class InlinedTempStrategyImpl : public StrategyBase, mlir::Value stride) { if constexpr (!hasLoops) fir::emitFatalError(loc, "array constructor lowering is inconsistent"); - auto loop = builder.create(loc, lower, upper, stride, - /*unordered=*/false, - /*finalCount=*/false); + auto loop = fir::DoLoopOp::create(builder, loc, lower, upper, stride, + /*unordered=*/false, + /*finalCount=*/false); builder.setInsertionPointToStart(loop.getBody()); return loop.getInductionVar(); } @@ -213,15 +213,15 @@ class AsElementalStrategy : public StrategyBase { assert(!elementalOp && "expected only one implied-do"); mlir::Value one = builder.createIntegerConstant(loc, builder.getIndexType(), 1); - elementalOp = builder.create( - loc, exprType, shape, - /*mold=*/nullptr, lengthParams, /*isUnordered=*/true); + elementalOp = hlfir::ElementalOp::create(builder, loc, exprType, shape, + /*mold=*/nullptr, lengthParams, + /*isUnordered=*/true); builder.setInsertionPointToStart(elementalOp.getBody()); // implied-do-index = lower+((i-1)*stride) - mlir::Value diff = builder.create( - loc, elementalOp.getIndices()[0], one); - mlir::Value mul = builder.create(loc, diff, stride); - mlir::Value add = builder.create(loc, lower, mul); + mlir::Value diff = 
mlir::arith::SubIOp::create( + builder, loc, elementalOp.getIndices()[0], one); + mlir::Value mul = mlir::arith::MulIOp::create(builder, loc, diff, stride); + mlir::Value add = mlir::arith::AddIOp::create(builder, loc, lower, mul); return add; } @@ -260,7 +260,7 @@ class AsElementalStrategy : public StrategyBase { if (destroyOp) destroyOp->erase(); - builder.create(loc, elementResult); + hlfir::YieldElementOp::create(builder, loc, elementResult); } // Override the default, because the context scope must be popped in @@ -315,8 +315,8 @@ class RuntimeTempStrategy : public StrategyBase { mlir::Value tempStorage = builder.createHeapTemporary( loc, declaredType, tempName, extents, lengths); mlir::Value shape = builder.genShape(loc, extents); - declare = builder.create( - loc, tempStorage, tempName, shape, lengths, + declare = hlfir::DeclareOp::create( + builder, loc, tempStorage, tempName, shape, lengths, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); initialBoxValue = builder.createBox(loc, boxType, declare->getOriginalBase(), shape, @@ -347,7 +347,7 @@ class RuntimeTempStrategy : public StrategyBase { /*slice=*/mlir::Value{}, emboxLengths, /*tdesc=*/{}); } - builder.create(loc, initialBoxValue, allocatableTemp); + fir::StoreOp::create(builder, loc, initialBoxValue, allocatableTemp); arrayConstructorVector = fir::runtime::genInitArrayConstructorVector( loc, builder, allocatableTemp, builder.createBool(loc, missingLengthParameters)); @@ -369,7 +369,7 @@ class RuntimeTempStrategy : public StrategyBase { loc, builder, value, arrayConstructorElementType); mlir::Value addr = fir::getBase(addrExv); if (mlir::isa(addr.getType())) - addr = builder.create(loc, addr); + addr = fir::BoxAddrOp::create(builder, loc, addr); fir::runtime::genPushArrayConstructorSimpleScalar( loc, builder, arrayConstructorVector, addr); if (cleanUp) @@ -389,9 +389,9 @@ class RuntimeTempStrategy : public StrategyBase { mlir::Value startImpliedDo(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Value lower, mlir::Value upper, mlir::Value stride) { - auto loop = builder.create(loc, lower, upper, stride, - /*unordered=*/false, - /*finalCount=*/false); + auto loop = fir::DoLoopOp::create(builder, loc, lower, upper, stride, + /*unordered=*/false, + /*finalCount=*/false); builder.setInsertionPointToStart(loop.getBody()); return loop.getInductionVar(); } @@ -409,7 +409,7 @@ class RuntimeTempStrategy : public StrategyBase { else temp = hlfir::derefPointersAndAllocatables( loc, builder, hlfir::Entity{allocatableTemp}); - auto hlfirExpr = builder.create(loc, temp, mustFree); + auto hlfirExpr = hlfir::AsExprOp::create(builder, loc, temp, mustFree); return hlfir::Entity{hlfirExpr}; } diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 6ed15df0de754..8c3648bcb0f35 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -85,7 +85,7 @@ static mlir::Value genRecordCPtrValueArg(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value rec, mlir::Type ty) { mlir::Value cAddr = fir::factory::genCPtrOrCFunptrAddr(builder, loc, rec, ty); - mlir::Value cVal = builder.create(loc, cAddr); + mlir::Value cVal = fir::LoadOp::create(builder, loc, cAddr); return builder.createConvert(loc, cAddr.getType(), cVal); } @@ -159,8 +159,8 @@ static mlir::Value readDim3Value(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type refI32Ty = fir::ReferenceType::get(i32Ty); llvm::SmallVector lenParams; - mlir::Value designate = builder.create( - loc, refI32Ty, dim3Addr, 
/*component=*/comp, + mlir::Value designate = hlfir::DesignateOp::create( + builder, loc, refI32Ty, dim3Addr, /*component=*/comp, /*componentShape=*/mlir::Value{}, hlfir::DesignateOp::Subscripts{}, /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, mlir::Value{}, lenParams); @@ -229,8 +229,8 @@ static mlir::Value remapActualToDummyDescriptor( if (fir::isPolymorphicType(dummyBoxType)) mold = explicitArgument; mlir::Value remapped = - builder.create(loc, dummyBoxType, baseAddr, shape, - /*slice=*/mlir::Value{}, lengths, mold); + fir::EmboxOp::create(builder, loc, dummyBoxType, baseAddr, shape, + /*slice=*/mlir::Value{}, lengths, mold); if (mapSymbols) symMap.popScope(); return remapped; @@ -273,12 +273,12 @@ static void remapActualToDummyDescriptors( mlir::Value newBox = remapActualToDummyDescriptor( loc, converter, symMap, argLambdaCapture, caller, isBindcCall); - builder.create(loc, newBox); + fir::ResultOp::create(builder, loc, newBox); }) .genElse([&]() { mlir::Value absent = - builder.create(loc, dummyType); - builder.create(loc, absent); + fir::AbsentOp::create(builder, loc, dummyType); + fir::ResultOp::create(builder, loc, absent); }) .getResults()[0]; caller.placeInput(arg, remapped); @@ -381,8 +381,8 @@ Fortran::lower::genCallOpAndResult( if (isExprCall) { mlir::Type exprType = hlfir::getExprType(type); - evaluateInMemory = builder.create( - loc, exprType, arrayResultShape, resultLengths); + evaluateInMemory = hlfir::EvaluateInMemoryOp::create( + builder, loc, exprType, arrayResultShape, resultLengths); builder.setInsertionPointToStart(&evaluateInMemory.getBody().front()); return toExtendedValue(loc, evaluateInMemory.getMemory(), extents, lengths); @@ -454,7 +454,7 @@ Fortran::lower::genCallOpAndResult( if (!addHostAssociations && mustCastFuncOpToCopeWithImplicitInterfaceMismatch( loc, converter, callSiteType, funcOpType)) - funcPointer = builder.create(loc, funcOpType, symbolAttr); + funcPointer = fir::AddrOfOp::create(builder, loc, funcOpType, symbolAttr); else funcSymbolAttr = symbolAttr; @@ -482,7 +482,7 @@ Fortran::lower::genCallOpAndResult( if (funcPointer) { operands.push_back( mlir::isa(funcPointer.getType()) - ? builder.create(loc, funcType, funcPointer) + ? 
fir::BoxAddrOp::create(builder, loc, funcType, funcPointer) : builder.createConvert(loc, funcType, funcPointer)); } @@ -496,14 +496,13 @@ Fortran::lower::genCallOpAndResult( auto *context = builder.getContext(); if (mlir::isa(snd) && mlir::isa(fst.getType())) { - auto funcTy = - mlir::FunctionType::get(context, std::nullopt, std::nullopt); + auto funcTy = mlir::FunctionType::get(context, {}, {}); auto boxProcTy = builder.getBoxProcType(funcTy); if (mlir::Value host = argumentHostAssocs(converter, fst)) { - cast = builder.create( - loc, boxProcTy, llvm::ArrayRef{fst, host}); + cast = fir::EmboxProcOp::create(builder, loc, boxProcTy, + llvm::ArrayRef{fst, host}); } else { - cast = builder.create(loc, boxProcTy, fst); + cast = fir::EmboxProcOp::create(builder, loc, boxProcTy, fst); } } else { mlir::Type fromTy = fir::unwrapRefType(fst.getType()); @@ -614,10 +613,10 @@ Fortran::lower::genCallOpAndResult( stream = fir::getBase(converter.genExprAddr( caller.getCallDescription().chevrons()[3], stmtCtx)); - builder.create( - loc, funcType.getResults(), funcSymbolAttr, grid_x, grid_y, grid_z, - block_x, block_y, block_z, bytes, stream, operands, - /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr); + cuf::KernelLaunchOp::create(builder, loc, funcType.getResults(), + funcSymbolAttr, grid_x, grid_y, grid_z, block_x, + block_y, block_z, bytes, stream, operands, + /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr); callNumResults = 0; } else if (caller.requireDispatchCall()) { // Procedure call requiring a dynamic dispatch. Call is created with @@ -641,8 +640,8 @@ Fortran::lower::genCallOpAndResult( // passed object because interface mismatch issues may have inserted a // cast to the operand with a different declared type, which would break // later type bound call resolution in the FIR to FIR pass. - dispatch = builder.create( - loc, funcType.getResults(), builder.getStringAttr(procName), + dispatch = fir::DispatchOp::create( + builder, loc, funcType.getResults(), builder.getStringAttr(procName), caller.getInputs()[*passArg], operands, builder.getI32IntegerAttr(*passArg), /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, procAttrs); @@ -657,9 +656,9 @@ Fortran::lower::genCallOpAndResult( mlir::Value passObject = fir::getBase(dataRefValue); if (fir::isa_ref_type(passObject.getType())) - passObject = builder.create(loc, passObject); - dispatch = builder.create( - loc, funcType.getResults(), builder.getStringAttr(procName), + passObject = fir::LoadOp::create(builder, loc, passObject); + dispatch = fir::DispatchOp::create( + builder, loc, funcType.getResults(), builder.getStringAttr(procName), passObject, operands, nullptr, /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, procAttrs); } @@ -668,8 +667,8 @@ Fortran::lower::genCallOpAndResult( callResult = dispatch.getResult(0); } else { // Standard procedure call with fir.call. 
- auto call = builder.create( - loc, funcType.getResults(), funcSymbolAttr, operands, + auto call = fir::CallOp::create( + builder, loc, funcType.getResults(), funcSymbolAttr, operands, /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, procAttrs); callNumResults = call.getNumResults(); @@ -692,9 +691,9 @@ Fortran::lower::genCallOpAndResult( if (caller.mustSaveResult()) { assert(allocatedResult.has_value()); - builder.create(loc, callResult, - fir::getBase(*allocatedResult), - arrayResultShape, resultLengths); + fir::SaveResultOp::create(builder, loc, callResult, + fir::getBase(*allocatedResult), arrayResultShape, + resultLengths); } if (evaluateInMemory) { @@ -865,9 +864,9 @@ static hlfir::EntityWithAttributes genStmtFunctionRef( // The result must not be a variable. result = hlfir::loadTrivialScalar(loc, builder, result); if (result.isVariable()) - result = hlfir::Entity{builder.create(loc, result)}; + result = hlfir::Entity{hlfir::AsExprOp::create(builder, loc, result)}; for (auto associate : exprAssociations) - builder.create(loc, associate); + hlfir::EndAssociateOp::create(builder, loc, associate); return hlfir::EntityWithAttributes{result}; } @@ -952,9 +951,9 @@ extendedValueToHlfirEntity(mlir::Location loc, fir::FirOpBuilder &builder, // rid of the memory indirection in a = char(b), so there is // little incentive to increase the compiler complexity. hlfir::Entity storage{builder.createTemporary(loc, charTy)}; - builder.create(loc, firBase, storage); - auto asExpr = builder.create( - loc, storage, /*mustFree=*/builder.createBool(loc, false)); + fir::StoreOp::create(builder, loc, firBase, storage); + auto asExpr = hlfir::AsExprOp::create( + builder, loc, storage, /*mustFree=*/builder.createBool(loc, false)); return hlfir::EntityWithAttributes{asExpr.getResult()}; } return hlfir::genDeclare(loc, builder, exv, name, @@ -966,7 +965,7 @@ namespace { struct CallCleanUp { struct CopyIn { void genCleanUp(mlir::Location loc, fir::FirOpBuilder &builder) { - builder.create(loc, tempBox, wasCopied, copyBackVar); + hlfir::CopyOutOp::create(builder, loc, tempBox, wasCopied, copyBackVar); } // address of the descriptor holding the temp if a temp was created. mlir::Value tempBox; @@ -977,7 +976,7 @@ struct CallCleanUp { }; struct ExprAssociate { void genCleanUp(mlir::Location loc, fir::FirOpBuilder &builder) { - builder.create(loc, tempVar, mustFree); + hlfir::EndAssociateOp::create(builder, loc, tempVar, mustFree); } mlir::Value tempVar; mlir::Value mustFree; @@ -1075,7 +1074,7 @@ struct ConditionallyPreparedDummy { /// Generate the "fir.result %preparedDummy" in the then branch of the /// wrapping fir.if. 
void genThenResult(mlir::Location loc, fir::FirOpBuilder &builder) const { - builder.create(loc, thenResultValues); + fir::ResultOp::create(builder, loc, thenResultValues); } /// Generate the "fir.result %absent" in the else branch of the @@ -1090,7 +1089,7 @@ struct ConditionallyPreparedDummy { else elseResultValues.push_back(builder.genAbsentOp(loc, type)); } - builder.create(loc, elseResultValues); + fir::ResultOp::create(builder, loc, elseResultValues); } /// Once the fir.if has been created, get the resulting %conditionallyPrepared @@ -1135,7 +1134,7 @@ static hlfir::Entity fixProcedureDummyMismatch(mlir::Location loc, if (mlir::isa(actual.getType()) && fir::isCharacterProcedureTuple(dummyType)) { mlir::Value length = - builder.create(loc, builder.getCharacterLengthType()); + fir::UndefOp::create(builder, loc, builder.getCharacterLengthType()); mlir::Value tuple = fir::factory::createCharacterProcedureTuple( builder, loc, dummyType, actual, length); return hlfir::Entity{tuple}; @@ -1318,8 +1317,8 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( mlir::Type tempBoxType = baseBoxTy.getBoxTypeWithNewAttr( fir::BaseBoxType::Attribute::Allocatable); mlir::Value tempBox = builder.createTemporary(loc, tempBoxType); - auto copyIn = builder.create( - loc, var, tempBox, /*var_is_present=*/mlir::Value{}); + auto copyIn = hlfir::CopyInOp::create(builder, loc, var, tempBox, + /*var_is_present=*/mlir::Value{}); // Register the copy-out after the call. preparedDummy.pushCopyInCleanUp(copyIn.getTempBox(), copyIn.getWasCopied(), doCopyOut ? copyIn.getVar() @@ -1331,16 +1330,17 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( fir::BaseBoxType boxType = fir::BoxType::get( hlfir::getFortranElementOrSequenceType(dummyTypeWithActualRank)); if (actualIsAssumedRank) - return hlfir::Entity{builder.create( - loc, boxType, var, fir::LowerBoundModifierAttribute::SetToOnes)}; + return hlfir::Entity{fir::ReboxAssumedRankOp::create( + builder, loc, boxType, var, + fir::LowerBoundModifierAttribute::SetToOnes)}; // Use actual shape when creating descriptor with dummy type, the dummy // shape may be unknown in case of sequence association. mlir::Type actualTy = hlfir::getFortranElementOrSequenceType(actual.getType()); boxType = boxType.getBoxTypeWithNewShape(actualTy); - return hlfir::Entity{builder.create(loc, boxType, var, - /*shape=*/mlir::Value{}, - /*slice=*/mlir::Value{})}; + return hlfir::Entity{fir::ReboxOp::create(builder, loc, boxType, var, + /*shape=*/mlir::Value{}, + /*slice=*/mlir::Value{})}; }; // Step 2: prepare the storage for the dummy arguments, ensuring that it @@ -1362,7 +1362,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( // generated writes in copy-out. isParameterObjectOrSubObject(entity)) { // Make a copy in a temporary. - auto copy = builder.create(loc, entity); + auto copy = hlfir::AsExprOp::create(builder, loc, entity); mlir::Type storageType = entity.getType(); mlir::NamedAttribute byRefAttr = fir::getAdaptToByRefAttr(builder); hlfir::AssociateOp associate = hlfir::genAssociateExpr( @@ -1442,14 +1442,14 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( auto lbModifier = needsZeroLowerBounds ? 
fir::LowerBoundModifierAttribute::SetToZeroes : fir::LowerBoundModifierAttribute::SetToOnes; - entity = hlfir::Entity{builder.create( - loc, dummyTypeWithActualRank, entity, lbModifier)}; + entity = hlfir::Entity{fir::ReboxAssumedRankOp::create( + builder, loc, dummyTypeWithActualRank, entity, lbModifier)}; } else { mlir::Value shift{}; if (needsZeroLowerBounds) shift = getZeroLowerBounds(loc, builder, entity); - entity = hlfir::Entity{builder.create( - loc, dummyTypeWithActualRank, entity, /*shape=*/shift, + entity = hlfir::Entity{fir::ReboxOp::create( + builder, loc, dummyTypeWithActualRank, entity, /*shape=*/shift, /*slice=*/mlir::Value{})}; } } @@ -1502,8 +1502,8 @@ static PreparedDummyArgument prepareUserCallActualArgument( // for this unusual if/then/else generation is that the number // and types of the if results will depend on how the argument // is prepared, and forecasting that here would be brittle. - auto badIfOp = builder.create(loc, dummyType, isPresent, - /*withElseRegion=*/false); + auto badIfOp = fir::IfOp::create(builder, loc, dummyType, isPresent, + /*withElseRegion=*/false); mlir::Block *preparationBlock = &badIfOp.getThenRegion().front(); builder.setInsertionPointToStart(preparationBlock); PreparedDummyArgument unconditionalDummy = @@ -1521,9 +1521,9 @@ static PreparedDummyArgument prepareUserCallActualArgument( // badIfOp cannot be modified and used here). llvm::SmallVector ifOpResultTypes; ConditionallyPreparedDummy conditionalDummy(unconditionalDummy); - auto ifOp = builder.create(loc, conditionalDummy.getIfResulTypes(), - isPresent, - /*withElseRegion=*/true); + auto ifOp = fir::IfOp::create(builder, loc, + conditionalDummy.getIfResulTypes(), isPresent, + /*withElseRegion=*/true); // Move "preparationBlock" into the "then" of the new // fir.if operation and create fir.result propagating // unconditionalDummy. @@ -1560,7 +1560,7 @@ static PreparedDummyArgument prepareProcedurePointerActualArgument( auto tempBoxProc{builder.createTemporary(loc, boxTy)}; hlfir::Entity nullBoxProc( fir::factory::createNullBoxProc(builder, loc, boxTy)); - builder.create(loc, nullBoxProc, tempBoxProc); + fir::StoreOp::create(builder, loc, nullBoxProc, tempBoxProc); return PreparedDummyArgument{tempBoxProc, /*cleanups=*/{}}; } hlfir::Entity actual = preparedActual.getActual(loc, builder); @@ -1569,7 +1569,7 @@ static PreparedDummyArgument prepareProcedurePointerActualArgument( assert(actual.isProcedure()); // Procedure actual to procedure pointer dummy. 
auto tempBoxProc{builder.createTemporary(loc, actual.getType())}; - builder.create(loc, actual, tempBoxProc); + fir::StoreOp::create(builder, loc, actual, tempBoxProc); return PreparedDummyArgument{tempBoxProc, /*cleanups=*/{}}; } @@ -1608,7 +1608,7 @@ void prepareUserCallArguments( "adapt.cptrbyval"); value = hlfir::Entity{genRecordCPtrValueArg( builder, loc, associate.getFirBase(), eleTy)}; - builder.create(loc, associate); + hlfir::EndAssociateOp::create(builder, loc, associate); } else { value = hlfir::Entity{genRecordCPtrValueArg(builder, loc, value, eleTy)}; @@ -1627,7 +1627,7 @@ void prepareUserCallArguments( loadedValue = builder.createConvert( loc, fir::ReferenceType::get(argTy), loadedValue); if (fir::isa_ref_type(loadedValue.getType())) - loadedValue = builder.create(loc, loadedValue); + loadedValue = fir::LoadOp::create(builder, loc, loadedValue); caller.placeInput(arg, loadedValue); if (cleanup) (*cleanup)(); @@ -1714,7 +1714,7 @@ void prepareUserCallArguments( /*nonDeferredParams=*/mlir::ValueRange{}, /*mutableProperties=*/{}); fir::factory::associateMutableBox(builder, loc, ptrBox, actualExv, - /*lbounds=*/std::nullopt); + /*lbounds=*/{}); caller.placeInput(arg, irBox); continue; } @@ -1812,8 +1812,9 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals, // In such case, the expression should not be freed after its use since // the result is stack allocated or deallocation (for allocatable results) // was already inserted in genCallOpAndResult. - auto asExpr = builder.create( - loc, resultEntity, /*mustFree=*/builder.createBool(loc, false)); + auto asExpr = + hlfir::AsExprOp::create(builder, loc, resultEntity, + /*mustFree=*/builder.createBool(loc, false)); return hlfir::EntityWithAttributes{asExpr.getResult()}; } return hlfir::EntityWithAttributes{resultEntity}; @@ -1861,12 +1862,12 @@ static ExvAndCleanup genOptionalValue(fir::FirOpBuilder &builder, "must be a numerical or logical scalar"); mlir::Value val = hlfir::loadTrivialScalar(loc, builder, entity); - builder.create(loc, val); + fir::ResultOp::create(builder, loc, val); }) .genElse([&]() { mlir::Value zero = fir::factory::createZeroValue(builder, loc, eleType); - builder.create(loc, zero); + fir::ResultOp::create(builder, loc, zero); }) .getResults()[0], std::nullopt}; @@ -1913,9 +1914,9 @@ static ExvAndCleanup genOptionalBox(fir::FirOpBuilder &builder, // ensures it won't be. mlir::Value box = builder.createBox(loc, newExv); mlir::Type boxType = box.getType(); - auto absent = builder.create(loc, boxType); - auto boxOrAbsent = builder.create( - loc, boxType, isPresent, box, absent); + auto absent = fir::AbsentOp::create(builder, loc, boxType); + auto boxOrAbsent = mlir::arith::SelectOp::create(builder, loc, boxType, + isPresent, box, absent); return {fir::BoxValue(boxOrAbsent), cleanup}; } @@ -2143,10 +2144,10 @@ genIntrinsicRefCore(Fortran::lower::PreparedActualArguments &loweredActuals, // ownership of this address cannot be taken here since it may not be a // temp. 
if (intrinsicName == "merge") - asExpr = builder.create(loc, resultEntity); + asExpr = hlfir::AsExprOp::create(builder, loc, resultEntity); else - asExpr = builder.create( - loc, resultEntity, builder.createBool(loc, mustBeFreed)); + asExpr = hlfir::AsExprOp::create(builder, loc, resultEntity, + builder.createBool(loc, mustBeFreed)); resultEntity = hlfir::EntityWithAttributes{asExpr.getResult()}; } return resultEntity; @@ -2526,7 +2527,7 @@ genIsPresentIfArgMaybeAbsent(mlir::Location loc, hlfir::Entity actual, // May fall into the category above if the allocatable is not optional. // Passing an optional to an optional. - return builder.create(loc, builder.getI1Type(), actual) + return fir::IsPresentOp::create(builder, loc, builder.getI1Type(), actual) .getResult(); } @@ -2814,9 +2815,9 @@ genProcedureRef(CallContext &callContext) { // TYPE(*) cannot be ALLOCATABLE/POINTER (C709) so there is no // need to cover the case of passing an ALLOCATABLE/POINTER to an // OPTIONAL. - isPresent = - builder.create(loc, builder.getI1Type(), actual) - .getResult(); + isPresent = fir::IsPresentOp::create(builder, loc, + builder.getI1Type(), actual) + .getResult(); } loweredActuals.push_back(Fortran::lower::PreparedActualArgument{ hlfir::Entity{*var}, isPresent}); @@ -2932,7 +2933,7 @@ std::optional Fortran::lower::convertCallToHLFIR( // this can be enforced whenscheduling forall/where expression evaluations. Fortran::lower::StatementContext localStmtCtx; mlir::Type bogusType = builder.getIndexType(); - auto exactlyOnce = builder.create(loc, bogusType); + auto exactlyOnce = hlfir::ExactlyOnceOp::create(builder, loc, bogusType); mlir::Block *block = builder.createBlock(&exactlyOnce.getBody()); builder.setInsertionPointToStart(block); CallContext callContext(procRef, resultType, loc, converter, symMap, @@ -2940,7 +2941,7 @@ std::optional Fortran::lower::convertCallToHLFIR( std::optional res = genProcedureRef(callContext); assert(res.has_value() && "must be a function"); - auto yield = builder.create(loc, *res); + auto yield = hlfir::YieldOp::create(builder, loc, *res); Fortran::lower::genCleanUpInRegionIfAny(loc, builder, yield.getCleanup(), localStmtCtx); builder.setInsertionPointAfter(exactlyOnce); diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp index 1051d50ce8a9a..768a237c92396 100644 --- a/flang/lib/Lower/ConvertConstant.cpp +++ b/flang/lib/Lower/ConvertConstant.cpp @@ -237,8 +237,8 @@ static mlir::Value genScalarLit( ? 
value.UnsignedDecimal() : value.SignedDecimal(), 10); - return builder.create( - loc, ty, mlir::IntegerAttr::get(ty, bigInt)); + return mlir::arith::ConstantOp::create( + builder, loc, ty, mlir::IntegerAttr::get(ty, bigInt)); } return builder.createIntegerConstant(loc, ty, value.ToInt64()); } else if constexpr (TC == Fortran::common::TypeCategory::Logical) { @@ -302,8 +302,9 @@ createStringLitOp(fir::FirOpBuilder &builder, mlir::Location loc, auto sizeTag = mlir::StringAttr::get(context, fir::StringLitOp::size()); mlir::NamedAttribute sizeAttr(sizeTag, builder.getI64IntegerAttr(len)); llvm::SmallVector attrs = {dataAttr, sizeAttr}; - return builder.create( - loc, llvm::ArrayRef{type}, std::nullopt, attrs); + return fir::StringLitOp::create(builder, loc, + llvm::ArrayRef{type}, + mlir::ValueRange{}, attrs); } } @@ -340,11 +341,11 @@ genScalarLit(fir::FirOpBuilder &builder, mlir::Location loc, [&](fir::FirOpBuilder &builder) { fir::StringLitOp str = createStringLitOp(builder, loc, value, len); - builder.create(loc, str); + fir::HasValueOp::create(builder, loc, str); }, builder.createLinkOnceLinkage()); - return builder.create(loc, global.resultType(), - global.getSymbol()); + return fir::AddrOfOp::create(builder, loc, global.resultType(), + global.getSymbol()); } // Helper to generate StructureConstructor component values. @@ -364,9 +365,9 @@ static mlir::Value genStructureComponentInit( auto fieldTy = fir::FieldType::get(recTy.getContext()); assert(componentTy && "failed to retrieve component"); // FIXME: type parameters must come from the derived-type-spec - auto field = builder.create( - loc, fieldTy, name, recTy, - /*typeParams=*/mlir::ValueRange{} /*TODO*/); + auto field = + fir::FieldIndexOp::create(builder, loc, fieldTy, name, recTy, + /*typeParams=*/mlir::ValueRange{} /*TODO*/); if (Fortran::semantics::IsAllocatable(sym)) { if (!Fortran::evaluate::IsNullPointerOrAllocatable(&expr)) { @@ -374,12 +375,12 @@ static mlir::Value genStructureComponentInit( "allocatable component value that is not NULL"); } else { // Handle NULL() initialization - mlir::Value componentValue{fir::factory::createUnallocatedBox( - builder, loc, componentTy, std::nullopt)}; + mlir::Value componentValue{ + fir::factory::createUnallocatedBox(builder, loc, componentTy, {})}; componentValue = builder.createConvert(loc, componentTy, componentValue); - return builder.create( - loc, recTy, res, componentValue, + return fir::InsertValueOp::create( + builder, loc, recTy, res, componentValue, builder.getArrayAttr(field.getAttributes())); } } @@ -400,9 +401,9 @@ static mlir::Value genStructureComponentInit( } else initialTarget = Fortran::lower::genInitialDataTarget(converter, loc, componentTy, expr); - res = builder.create( - loc, recTy, res, initialTarget, - builder.getArrayAttr(field.getAttributes())); + res = + fir::InsertValueOp::create(builder, loc, recTy, res, initialTarget, + builder.getArrayAttr(field.getAttributes())); return res; } @@ -426,7 +427,7 @@ static mlir::Value genStructureComponentInit( mlir::Value addr = fir::getBase( Fortran::lower::genExtAddrInInitializer(converter, loc, expr)); if (mlir::isa(addr.getType())) - addr = builder.create(loc, addr); + addr = fir::BoxAddrOp::create(builder, loc, addr); assert((fir::isa_ref_type(addr.getType()) || mlir::isa(addr.getType())) && "expect reference type for address field"); @@ -435,24 +436,25 @@ static mlir::Value genStructureComponentInit( auto cPtrRecTy = mlir::cast(componentTy); llvm::StringRef addrFieldName = Fortran::lower::builtin::cptrFieldName; 
mlir::Type addrFieldTy = cPtrRecTy.getType(addrFieldName); - auto addrField = builder.create( - loc, fieldTy, addrFieldName, componentTy, + auto addrField = fir::FieldIndexOp::create( + builder, loc, fieldTy, addrFieldName, componentTy, /*typeParams=*/mlir::ValueRange{}); mlir::Value castAddr = builder.createConvert(loc, addrFieldTy, addr); - auto undef = builder.create(loc, componentTy); - addr = builder.create( - loc, componentTy, undef, castAddr, + auto undef = fir::UndefOp::create(builder, loc, componentTy); + addr = fir::InsertValueOp::create( + builder, loc, componentTy, undef, castAddr, builder.getArrayAttr(addrField.getAttributes())); - res = builder.create( - loc, recTy, res, addr, builder.getArrayAttr(field.getAttributes())); + res = + fir::InsertValueOp::create(builder, loc, recTy, res, addr, + builder.getArrayAttr(field.getAttributes())); return res; } mlir::Value val = fir::getBase(genConstantValue(converter, loc, expr)); assert(!fir::isa_ref_type(val.getType()) && "expecting a constant value"); mlir::Value castVal = builder.createConvert(loc, componentTy, val); - res = builder.create( - loc, recTy, res, castVal, builder.getArrayAttr(field.getAttributes())); + res = fir::InsertValueOp::create(builder, loc, recTy, res, castVal, + builder.getArrayAttr(field.getAttributes())); return res; } @@ -465,7 +467,7 @@ static mlir::Value genInlinedStructureCtorLitImpl( auto recTy = mlir::cast(type); if (!converter.getLoweringOptions().getLowerToHighLevelFIR()) { - mlir::Value res = builder.create(loc, recTy); + mlir::Value res = fir::UndefOp::create(builder, loc, recTy); for (const auto &[sym, expr] : ctor.values()) { // Parent components need more work because they do not appear in the // fir.rec type. @@ -495,13 +497,13 @@ static mlir::Value genInlinedStructureCtorLitImpl( break; } for (mlir::Type parentType : llvm::reverse(parentTypes)) { - auto undef = builder.create(loc, parentType); + auto undef = fir::UndefOp::create(builder, loc, parentType); fir::RecordType parentRecTy = mlir::cast(parentType); - auto field = builder.create( - loc, fieldTy, parentRecTy.getTypeList()[0].first, parentType, + auto field = fir::FieldIndexOp::create( + builder, loc, fieldTy, parentRecTy.getTypeList()[0].first, parentType, /*typeParams=*/mlir::ValueRange{} /*TODO*/); - res = builder.create( - loc, parentRecTy, undef, res, + res = fir::InsertValueOp::create( + builder, loc, parentRecTy, undef, res, builder.getArrayAttr(field.getAttributes())); } }; @@ -514,7 +516,7 @@ static mlir::Value genInlinedStructureCtorLitImpl( if (!res) { mlir::Type parentType = converter.genType(*componentParentType); curentType = componentParentType; - res = builder.create(loc, parentType); + res = fir::UndefOp::create(builder, loc, parentType); } else if (*componentParentType != *curentType) { mlir::Type parentType = converter.genType(*componentParentType); insertParentValueIntoExtension(parentType); @@ -524,7 +526,7 @@ static mlir::Value genInlinedStructureCtorLitImpl( } if (!res) // structure constructor for empty type. - return builder.create(loc, recTy); + return fir::UndefOp::create(builder, loc, recTy); // The last component may belong to a parent type. 
if (res.getType() != recTy) @@ -550,12 +552,12 @@ static mlir::Value genScalarLit( [&](fir::FirOpBuilder &builder) { mlir::Value result = genInlinedStructureCtorLitImpl(converter, loc, value, eleTy); - builder.create(loc, result); + fir::HasValueOp::create(builder, loc, result); }, builder.createInternalLinkage()); } - return builder.create(loc, global.resultType(), - global.getSymbol()); + return fir::AddrOfOp::create(builder, loc, global.resultType(), + global.getSymbol()); } /// Create an evaluate::Constant array to a fir.array<> value @@ -576,7 +578,7 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter, builder.getIntegerAttr(idxTy, subscripts[i] - con.lbounds()[i])); return idx; }; - mlir::Value array = builder.create(loc, arrayTy); + mlir::Value array = fir::UndefOp::create(builder, loc, arrayTy); if (Fortran::evaluate::GetSize(con.shape()) == 0) return array; if constexpr (T::category == Fortran::common::TypeCategory::Character) { @@ -584,8 +586,9 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter, mlir::Value elementVal = genScalarLit(builder, loc, con.At(subscripts), con.LEN(), /*outlineInReadOnlyMemory=*/false); - array = builder.create( - loc, arrayTy, array, elementVal, builder.getArrayAttr(createIdx())); + array = + fir::InsertValueOp::create(builder, loc, arrayTy, array, elementVal, + builder.getArrayAttr(createIdx())); } while (con.IncrementSubscripts(subscripts)); } else if constexpr (T::category == Fortran::common::TypeCategory::Derived) { do { @@ -594,8 +597,9 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter, mlir::Value elementVal = genScalarLit(converter, loc, con.At(subscripts), eleTy, /*outlineInReadOnlyMemory=*/false); - array = builder.create( - loc, arrayTy, array, elementVal, builder.getArrayAttr(createIdx())); + array = + fir::InsertValueOp::create(builder, loc, arrayTy, array, elementVal, + builder.getArrayAttr(createIdx())); } while (con.IncrementSubscripts(subscripts)); } else { llvm::SmallVector rangeStartIdx; @@ -611,9 +615,9 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter, bool nextIsSame = con.IncrementSubscripts(nextSubscripts) && con.At(subscripts) == con.At(nextSubscripts); if (!rangeSize && !nextIsSame) { // single (non-range) value - array = builder.create( - loc, arrayTy, array, getElementVal(), - builder.getArrayAttr(createIdx())); + array = fir::InsertValueOp::create(builder, loc, arrayTy, array, + getElementVal(), + builder.getArrayAttr(createIdx())); } else if (!rangeSize) { // start a range rangeStartIdx = createIdx(); rangeSize = 1; @@ -629,8 +633,8 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter, rangeBounds.push_back( mlir::cast(idx[i]).getValue().getSExtValue()); } - array = builder.create( - loc, arrayTy, array, getElementVal(), + array = fir::InsertOnRangeOp::create( + builder, loc, arrayTy, array, getElementVal(), builder.getIndexVectorAttr(rangeBounds)); rangeSize = 0; } @@ -679,12 +683,12 @@ genOutlineArrayLit(Fortran::lower::AbstractConverter &converter, [&](fir::FirOpBuilder &builder) { mlir::Value result = genInlinedArrayLit(converter, loc, arrayTy, constant); - builder.create(loc, result); + fir::HasValueOp::create(builder, loc, result); }, builder.createInternalLinkage()); } - return builder.create(loc, global.resultType(), - global.getSymbol()); + return fir::AddrOfOp::create(builder, loc, global.resultType(), + global.getSymbol()); } /// Convert an evaluate::Constant array into an fir::ExtendedValue. 
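
// ---------------------------------------------------------------------------
// Reviewer aid (not part of the patch): every hunk in this change mechanically
// moves call sites from `builder.create<OpTy>(loc, args...)` to the static
// `OpTy::create(builder, loc, args...)` spelling. The snippet below is a
// self-contained mock -- `Builder`, `Location`, and `LoadOp` are illustrative
// stand-ins, not the real MLIR/FIR classes -- showing how such a static hook
// can simply forward to the builder, so both spellings construct the same
// operation.
#include <iostream>
#include <string>
#include <utility>

struct Location {
  std::string file;
  unsigned line;
};

struct Builder {
  // Stand-in for mlir::OpBuilder::create<OpTy>(loc, args...).
  template <typename OpTy, typename... Args>
  OpTy create(Location loc, Args &&...args) {
    return OpTy(std::move(loc), std::forward<Args>(args)...);
  }
};

struct LoadOp {
  Location loc;
  std::string memref;
  LoadOp(Location l, std::string m)
      : loc(std::move(l)), memref(std::move(m)) {}

  // New-style entry point: OpTy::create(builder, loc, args...) forwards to the
  // builder, so call sites can migrate without changing behavior.
  static LoadOp create(Builder &b, Location loc, std::string memref) {
    return b.create<LoadOp>(std::move(loc), std::move(memref));
  }
};

int main() {
  Builder builder;
  Location loc{"example.f90", 42};
  LoadOp oldStyle = builder.create<LoadOp>(loc, "x");  // old spelling
  LoadOp newStyle = LoadOp::create(builder, loc, "x"); // new spelling
  std::cout << oldStyle.memref << " == " << newStyle.memref << "\n";
  return 0;
}
// ---------------------------------------------------------------------------
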
diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index f3430bfa3021e..3578f941ec1b4 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -267,8 +267,8 @@ static mlir::Value genActualIsPresentTest(fir::FirOpBuilder &builder, // Optional case (not that optional allocatable/pointer cannot be absent // when passed to CMPLX as per 15.5.2.12 point 3 (7) and (8)). It is // therefore possible to catch them in the `then` case above. - return builder.create(loc, builder.getI1Type(), - fir::getBase(actual)); + return fir::IsPresentOp::create(builder, loc, builder.getI1Type(), + fir::getBase(actual)); } /// Convert the array_load, `load`, to an extended value. If `path` is not @@ -345,8 +345,8 @@ arrayLoadExtValue(fir::FirOpBuilder &builder, mlir::Location loc, auto origins = fir::factory::getNonDefaultLowerBounds(builder, loc, exv); if (shapeVal) { // shapeVal is a ShiftOp and load.memref() is a boxed value. - newBase = builder.create(loc, oldBox.getType(), oldBox, - shapeVal, /*slice=*/mlir::Value{}); + newBase = fir::ReboxOp::create(builder, loc, oldBox.getType(), oldBox, + shapeVal, /*slice=*/mlir::Value{}); origins = fir::factory::getOrigins(shapeVal); } return fir::substBase(arrayToExtendedValue(extents, origins), newBase); @@ -378,7 +378,7 @@ placeScalarValueInMemory(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value temp = builder.createTemporary( loc, storageType, llvm::ArrayRef{fir::getAdaptToByRefAttr(builder)}); - builder.create(loc, val, temp); + fir::StoreOp::create(builder, loc, val, temp); return fir::substBase(exv, temp); } @@ -434,14 +434,14 @@ static fir::ExtendedValue genLoad(fir::FirOpBuilder &builder, if (mlir::isa( fir::unwrapRefType(fir::getBase(p).getType()))) return p; - mlir::Value load = builder.create(loc, fir::getBase(p)); + mlir::Value load = fir::LoadOp::create(builder, loc, fir::getBase(p)); return fir::PolymorphicValue(load, p.getSourceBox()); }, [&](const fir::UnboxedValue &v) -> fir::ExtendedValue { if (mlir::isa( fir::unwrapRefType(fir::getBase(v).getType()))) return v; - return builder.create(loc, fir::getBase(v)); + return fir::LoadOp::create(builder, loc, fir::getBase(v)); }, [&](const fir::MutableBoxValue &box) -> fir::ExtendedValue { return genLoad(builder, loc, @@ -473,11 +473,11 @@ static fir::ExtendedValue genOptionalValue(fir::FirOpBuilder &builder, /*withElseRegion=*/true) .genThen([&]() { mlir::Value val = fir::getBase(genLoad(builder, loc, exv)); - builder.create(loc, val); + fir::ResultOp::create(builder, loc, val); }) .genElse([&]() { mlir::Value zero = fir::factory::createZeroValue(builder, loc, eleType); - builder.create(loc, zero); + fir::ResultOp::create(builder, loc, zero); }) .getResults()[0]; } @@ -521,9 +521,9 @@ static fir::ExtendedValue genOptionalBox(fir::FirOpBuilder &builder, // ensures it won't be. 
mlir::Value box = builder.createBox(loc, newExv); mlir::Type boxType = box.getType(); - auto absent = builder.create(loc, boxType); - auto boxOrAbsent = builder.create( - loc, boxType, isPresent, box, absent); + auto absent = fir::AbsentOp::create(builder, loc, boxType); + auto boxOrAbsent = mlir::arith::SelectOp::create(builder, loc, boxType, + isPresent, box, absent); return fir::BoxValue(boxOrAbsent); } @@ -569,13 +569,13 @@ createBoxProcCharTuple(Fortran::lower::AbstractConverter &converter, if (fir::isa_ref_type(fromTy)) funcAddr = builder.createConvert(loc, toTy, funcAddr); else if (mlir::isa(fromTy)) - funcAddr = builder.create(loc, toTy, funcAddr); + funcAddr = fir::BoxAddrOp::create(builder, loc, toTy, funcAddr); auto boxProc = [&]() -> mlir::Value { if (auto host = Fortran::lower::argumentHostAssocs(converter, funcAddr)) - return builder.create( - loc, boxTy, llvm::ArrayRef{funcAddr, host}); - return builder.create(loc, boxTy, funcAddr); + return fir::EmboxProcOp::create( + builder, loc, boxTy, llvm::ArrayRef{funcAddr, host}); + return fir::EmboxProcOp::create(builder, loc, boxTy, funcAddr); }(); return fir::factory::createCharacterProcedureTuple(builder, loc, argTy, boxProc, charLen); @@ -596,9 +596,9 @@ absentBoxToUnallocatedBox(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type boxType = box.getType(); assert(mlir::isa(boxType) && "argument must be a fir.box"); mlir::Value emptyBox = - fir::factory::createUnallocatedBox(builder, loc, boxType, std::nullopt); + fir::factory::createUnallocatedBox(builder, loc, boxType, {}); auto safeToReadBox = - builder.create(loc, isPresent, box, emptyBox); + mlir::arith::SelectOp::create(builder, loc, isPresent, box, emptyBox); return fir::substBase(exv, safeToReadBox); } @@ -822,9 +822,9 @@ class ScalarExprLowering { Fortran::common::TypeCategory::Integer, *unsignedKind); mlir::Value lhsSL = builder.createConvert(loc, signlessType, *lhs); mlir::Value rhsSL = builder.createConvert(loc, signlessType, *rhs); - return builder.create(loc, pred, lhsSL, rhsSL); + return OpTy::create(builder, loc, pred, lhsSL, rhsSL); } - return builder.create(loc, pred, *lhs, *rhs); + return OpTy::create(builder, loc, pred, *lhs, *rhs); } } fir::emitFatalError(getLoc(), "array compare should be handled in genarr"); @@ -841,7 +841,7 @@ class ScalarExprLowering { const ExtValue &left, const ExtValue &right) { if (const fir::UnboxedValue *lhs = left.getUnboxed()) if (const fir::UnboxedValue *rhs = right.getUnboxed()) - return builder.create(getLoc(), pred, *lhs, *rhs); + return OpTy::create(builder, getLoc(), pred, *lhs, *rhs); fir::emitFatalError(getLoc(), "array compare should be handled in genarr"); } template @@ -904,7 +904,7 @@ class ScalarExprLowering { mlir::Value cnvrt = Fortran::lower::addCrayPointerInst( loc, builder, ptrVal, ptrTy, pteVal.getType()); - addr = builder.create(loc, cnvrt); + addr = fir::LoadOp::create(builder, loc, cnvrt); } return genLoad(addr); } @@ -970,12 +970,12 @@ class ScalarExprLowering { std::string name = converter.getRecordTypeFieldName(sym); // FIXME: type parameters must come from the derived-type-spec - mlir::Value field = builder.create( - loc, fieldTy, name, ty, - /*typeParams=*/mlir::ValueRange{} /*TODO*/); + mlir::Value field = + fir::FieldIndexOp::create(builder, loc, fieldTy, name, ty, + /*typeParams=*/mlir::ValueRange{} /*TODO*/); mlir::Type coorTy = builder.getRefType(recTy.getType(name)); - auto coor = builder.create(loc, coorTy, - fir::getBase(res), field); + auto coor = fir::CoordinateOp::create(builder, loc, 
coorTy, + fir::getBase(res), field); ExtValue to = fir::factory::componentToExtendedValue(builder, loc, coor); to.match( [&](const fir::UnboxedValue &toPtr) { @@ -1003,9 +1003,9 @@ class ScalarExprLowering { }, [&](const fir::MutableBoxValue &toBox) { if (toBox.isPointer()) { - Fortran::lower::associateMutableBox(converter, loc, toBox, expr, - /*lbounds=*/std::nullopt, - stmtCtx); + Fortran::lower::associateMutableBox( + converter, loc, toBox, expr, + /*lbounds=*/mlir::ValueRange{}, stmtCtx); return; } // For allocatable components, a deep copy is needed. @@ -1077,7 +1077,7 @@ class ScalarExprLowering { mlir::Value input = genunbox(op.left()); // Like LLVM, integer negation is the binary op "0 - value" mlir::Value zero = genIntegerConstant(builder.getContext(), 0); - return builder.create(getLoc(), zero, input); + return mlir::arith::SubIOp::create(builder, getLoc(), zero, input); } template ExtValue genval(const Fortran::evaluate::Negate(builder.getContext(), 0); - mlir::Value neg = builder.create(loc, zero, signless); + mlir::Value neg = mlir::arith::SubIOp::create(builder, loc, zero, signless); return builder.createConvert(loc, input.getType(), neg); } template ExtValue genval(const Fortran::evaluate::Negate> &op) { - return builder.create(getLoc(), genunbox(op.left())); + return mlir::arith::NegFOp::create(builder, getLoc(), genunbox(op.left())); } template ExtValue genval(const Fortran::evaluate::Negate> &op) { - return builder.create(getLoc(), genunbox(op.left())); + return fir::NegcOp::create(builder, getLoc(), genunbox(op.left())); } template @@ -1312,7 +1312,7 @@ class ScalarExprLowering { ExtValue input = genval(op.left()); mlir::Value base = fir::getBase(input); mlir::Value newBase = - builder.create(getLoc(), base.getType(), base); + fir::NoReassocOp::create(builder, getLoc(), base.getType(), base); return fir::substBase(input, newBase); } @@ -1322,7 +1322,7 @@ class ScalarExprLowering { mlir::Value one = genBoolConstant(true); mlir::Value val = builder.createConvert(getLoc(), builder.getI1Type(), logical); - return builder.create(getLoc(), val, one); + return mlir::arith::XOrIOp::create(builder, getLoc(), val, one); } template @@ -1384,8 +1384,9 @@ class ScalarExprLowering { mlir::Value offset = builder.createIntegerConstant( loc, idxTy, x.part() == Fortran::evaluate::ComplexPart::Part::RE ? 0 : 1); - mlir::Value result = builder.create( - loc, builder.getRefType(eleTy), base, mlir::ValueRange{offset}); + mlir::Value result = + fir::CoordinateOp::create(builder, loc, builder.getRefType(eleTy), base, + mlir::ValueRange{offset}); return {result}; } ExtValue genval(const Fortran::evaluate::ComplexPart &x) { @@ -1500,8 +1501,8 @@ class ScalarExprLowering { auto recTy = mlir::cast(ty); const Fortran::semantics::Symbol &sym = getLastSym(*field); std::string name = converter.getRecordTypeFieldName(sym); - coorArgs.push_back(builder.create( - loc, fldTy, name, recTy, fir::getTypeParams(obj))); + coorArgs.push_back(fir::FieldIndexOp::create( + builder, loc, fldTy, name, recTy, fir::getTypeParams(obj))); ty = recTy.getType(name); } // If parent component is referred then it has no coordinate argument. 
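
// ---------------------------------------------------------------------------
// Reviewer aid (not part of the patch): the hunks above rewrite, among others,
// genOptionalValue/genOptionalBox, which guard OPTIONAL dummies by branching
// on a presence flag and yielding either the real value/box or a safe
// placeholder (a zero value or fir.absent). The toy below models the same
// shape with std::optional; the name lowerOptionalValue and the int payload
// are illustrative only, not Flang code.
#include <iostream>
#include <optional>

int lowerOptionalValue(const std::optional<int> &actual) {
  if (actual.has_value()) // fir.if %isPresent { fir.result %loaded }
    return *actual;
  return 0;               // else { fir.result %zero } -- well-defined filler
}

int main() {
  std::cout << lowerOptionalValue(std::optional<int>{7}) << "\n"; // prints 7
  std::cout << lowerOptionalValue(std::nullopt) << "\n";          // prints 0
  return 0;
}
// ---------------------------------------------------------------------------
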
@@ -1510,8 +1511,8 @@ class ScalarExprLowering { ty = builder.getRefType(ty); return fir::factory::componentToExtendedValue( builder, loc, - builder.create(loc, ty, fir::getBase(obj), - coorArgs)); + fir::CoordinateOp::create(builder, loc, ty, fir::getBase(obj), + coorArgs)); } ExtValue gen(const Fortran::evaluate::Component &cmpt) { @@ -1587,7 +1588,7 @@ class ScalarExprLowering { mlir::Value val = fir::getBase(subVal); mlir::Type ty = val.getType(); mlir::Value lb = getLBound(array, subsc.index(), ty); - args.push_back(builder.create(loc, ty, val, lb)); + args.push_back(mlir::arith::SubIOp::create(builder, loc, ty, val, lb)); } mlir::Value base = fir::getBase(array); @@ -1602,7 +1603,7 @@ class ScalarExprLowering { mlir::Value cnvrt = Fortran::lower::addCrayPointerInst( loc, builder, ptrVal, ptrTy, base.getType()); - base = builder.create(loc, cnvrt); + base = fir::LoadOp::create(builder, loc, cnvrt); } mlir::Type eleTy = fir::dyn_cast_ptrOrBoxEleTy(base.getType()); @@ -1611,7 +1612,7 @@ class ScalarExprLowering { auto seqTy = mlir::cast(eleTy); assert(args.size() == seqTy.getDimension()); mlir::Type ty = builder.getRefType(seqTy.getEleTy()); - auto addr = builder.create(loc, ty, base, args); + auto addr = fir::CoordinateOp::create(builder, loc, ty, base, args); return fir::factory::arrayElementToExtendedValue(builder, loc, array, addr); } @@ -1648,12 +1649,12 @@ class ScalarExprLowering { mlir::Value val = builder.createConvert(loc, idxTy, fir::getBase(subVal)); mlir::Value lb = builder.createConvert(loc, idxTy, getLB(arr, dim)); - mlir::Value diff = builder.create(loc, val, lb); + mlir::Value diff = mlir::arith::SubIOp::create(builder, loc, val, lb); mlir::Value prod = - builder.create(loc, delta, diff); - total = builder.create(loc, prod, total); + mlir::arith::MulIOp::create(builder, loc, delta, diff); + total = mlir::arith::AddIOp::create(builder, loc, prod, total); if (ext) - delta = builder.create(loc, delta, ext); + delta = mlir::arith::MulIOp::create(builder, loc, delta, ext); ++dim; } mlir::Type origRefTy = refTy; @@ -1672,8 +1673,8 @@ class ScalarExprLowering { base = builder.createConvert(loc, seqRefTy, base); } } - auto coor = builder.create( - loc, refTy, base, llvm::ArrayRef{total}); + auto coor = fir::CoordinateOp::create(builder, loc, refTy, base, + llvm::ArrayRef{total}); // Convert to expected, original type after address arithmetic. 
return builder.createConvert(loc, origRefTy, coor); }; @@ -1725,9 +1726,9 @@ class ScalarExprLowering { builder.createConvert(loc, idxTy, fir::getBase(subVal))); } mlir::Value shape = builder.createShape(loc, exv); - mlir::Value elementAddr = builder.create( - loc, refTy, addr, shape, /*slice=*/mlir::Value{}, arrayCoorArgs, - fir::getTypeParams(exv)); + mlir::Value elementAddr = fir::ArrayCoorOp::create( + builder, loc, refTy, addr, shape, /*slice=*/mlir::Value{}, + arrayCoorArgs, fir::getTypeParams(exv)); return fir::factory::arrayElementToExtendedValue(builder, loc, exv, elementAddr); } @@ -1826,8 +1827,8 @@ class ScalarExprLowering { if (mlir::isa(exvTy)) { auto boxProcTy = builder.getBoxProcType(mlir::cast(exvTy)); - return builder.create(loc, boxProcTy, - fir::getBase(exv)); + return fir::EmboxProcOp::create(builder, loc, boxProcTy, + fir::getBase(exv)); } mlir::Value box = builder.createBox(loc, exv, exv.isPolymorphic()); if (Fortran::lower::isParentComponent(expr)) { @@ -2073,8 +2074,8 @@ class ScalarExprLowering { TODO(loc, "creating temporary for derived type with length parameters"); } - mlir::Value temp = builder.create( - loc, type, tempName, allocMemTypeParams, extents); + mlir::Value temp = fir::AllocMemOp::create(builder, loc, type, tempName, + allocMemTypeParams, extents); if (mlir::isa(fir::unwrapSequenceType(type))) return fir::CharArrayBoxValue{temp, charLen, extents}; return fir::ArrayBoxValue{temp, extents}; @@ -2124,9 +2125,9 @@ class ScalarExprLowering { mlir::Type type = v.getType(); mlir::Value value = v; if (fir::isa_ref_type(type)) - value = builder.create(loc, value); + value = fir::LoadOp::create(builder, loc, value); mlir::Value temp = builder.createTemporary(loc, value.getType()); - builder.create(loc, value, temp); + fir::StoreOp::create(builder, loc, value, temp); return temp; }, [&](const fir::BoxValue &x) -> ExtValue { @@ -2141,9 +2142,9 @@ class ScalarExprLowering { // created always has the declared type. mlir::Value var = fir::getBase(fir::factory::readBoxValue(builder, loc, x)); - auto value = builder.create(loc, var); + auto value = fir::LoadOp::create(builder, loc, var); mlir::Value temp = builder.createTemporary(loc, value.getType()); - builder.create(loc, value, temp); + fir::StoreOp::create(builder, loc, value, temp); return temp; }, [&](const fir::PolymorphicValue &p) -> ExtValue { @@ -2242,7 +2243,7 @@ class ScalarExprLowering { // at this point. 
mlir::Value destBox = fir::getBase(builder.createBox(loc, temp)); mlir::Value boxRef = builder.createTemporary(loc, destBox.getType()); - builder.create(loc, destBox, boxRef); + fir::StoreOp::create(builder, loc, destBox, boxRef); fir::runtime::genAssignTemporary(builder, loc, boxRef, fir::getBase(actualArg)); return temp; @@ -2250,21 +2251,22 @@ class ScalarExprLowering { auto noCopy = [&]() { mlir::Value box = fir::getBase(actualArg); - mlir::Value boxAddr = builder.create(loc, addrType, box); - builder.create(loc, boxAddr); + mlir::Value boxAddr = fir::BoxAddrOp::create(builder, loc, addrType, box); + fir::ResultOp::create(builder, loc, boxAddr); }; auto combinedCondition = [&]() { if (isActualArgBox) { mlir::Value zero = builder.createIntegerConstant(loc, builder.getI1Type(), 0); - mlir::Value notContiguous = builder.create( - loc, mlir::arith::CmpIPredicate::eq, isContiguousResult, zero); + mlir::Value notContiguous = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, isContiguousResult, + zero); if (!restrictCopyAtRuntime) { restrictCopyAtRuntime = notContiguous; } else { - mlir::Value cond = builder.create( - loc, *restrictCopyAtRuntime, notContiguous); + mlir::Value cond = mlir::arith::AndIOp::create( + builder, loc, *restrictCopyAtRuntime, notContiguous); restrictCopyAtRuntime = cond; } } @@ -2280,7 +2282,7 @@ class ScalarExprLowering { .genThen([&]() { noCopy(); }) .genElse([&] { ExtValue temp = doCopyIn(); - builder.create(loc, fir::getBase(temp)); + fir::ResultOp::create(builder, loc, fir::getBase(temp)); }) .getResults()[0]; fir::ExtendedValue temp = @@ -2312,19 +2314,19 @@ class ScalarExprLowering { .genThen([&]() { noCopy(); }) .genElse([&]() { ExtValue temp = doCopyIn(); - builder.create(loc, - fir::getBase(temp)); + fir::ResultOp::create(builder, loc, + fir::getBase(temp)); }) .getResults()[0]; - builder.create(loc, addr1); + fir::ResultOp::create(builder, loc, addr1); } else { ExtValue temp = doCopyIn(); - builder.create(loc, fir::getBase(temp)); + fir::ResultOp::create(builder, loc, fir::getBase(temp)); } }) .genElse([&]() { mlir::Value nullPtr = builder.createNullConstant(loc, addrType); - builder.create(loc, nullPtr); + fir::ResultOp::create(builder, loc, nullPtr); }) .getResults()[0]; // Associate the temp address with actualArg lengths and extents if a @@ -2357,7 +2359,7 @@ class ScalarExprLowering { tempBox); } // Deallocate the top-level entity of the temporary. - builder.create(loc, fir::getBase(copyOutPair.temp)); + fir::FreeMemOp::create(builder, loc, fir::getBase(copyOutPair.temp)); return; } // Generate CopyOutAssign() call to copy data from the temporary @@ -2376,11 +2378,11 @@ class ScalarExprLowering { mlir::Type allocBoxTy = mlir::cast(srcBox.getType()) .getBoxTypeWithNewAttr(fir::BaseBoxType::Attribute::Allocatable); - srcBox = builder.create(loc, allocBoxTy, srcBox, - /*shift=*/mlir::Value{}, - /*slice=*/mlir::Value{}); + srcBox = fir::ReboxOp::create(builder, loc, allocBoxTy, srcBox, + /*shift=*/mlir::Value{}, + /*slice=*/mlir::Value{}); mlir::Value srcBoxRef = builder.createTemporary(loc, srcBox.getType()); - builder.create(loc, srcBox, srcBoxRef); + fir::StoreOp::create(builder, loc, srcBox, srcBoxRef); // Create descriptor pointer to variable descriptor if copy out is needed, // and nullptr otherwise. 
mlir::Value destBoxRef; @@ -2388,9 +2390,9 @@ class ScalarExprLowering { mlir::Value destBox = fir::getBase(builder.createBox(loc, copyOutPair.var)); destBoxRef = builder.createTemporary(loc, destBox.getType()); - builder.create(loc, destBox, destBoxRef); + fir::StoreOp::create(builder, loc, destBox, destBoxRef); } else { - destBoxRef = builder.create(loc, srcBoxRef.getType()); + destBoxRef = fir::ZeroOp::create(builder, loc, srcBoxRef.getType()); } fir::runtime::genCopyOutAssign(builder, loc, destBoxRef, srcBoxRef); }; @@ -2436,8 +2438,8 @@ class ScalarExprLowering { // fir.box is absent. ExtValue actualArg = gen(expr); mlir::Value actualArgBase = fir::getBase(actualArg); - mlir::Value isPresent = builder.create( - loc, builder.getI1Type(), actualArgBase); + mlir::Value isPresent = fir::IsPresentOp::create( + builder, loc, builder.getI1Type(), actualArgBase); if (!mlir::isa(actualArgBase.getType())) return {actualArg, isPresent}; ExtValue safeToReadBox = @@ -2457,7 +2459,7 @@ class ScalarExprLowering { if (const fir::CharBoxValue *charBox = actualArg.getCharBox()) { mlir::Value len = charBox->getLen(); mlir::Value zero = builder.createIntegerConstant(loc, len.getType(), 0); - len = builder.create(loc, isPresent, len, zero); + len = mlir::arith::SelectOp::create(builder, loc, isPresent, len, zero); mlir::Value temp = builder.createTemporary(loc, type, /*name=*/{}, /*shape=*/{}, mlir::ValueRange{len}, @@ -2538,12 +2540,12 @@ class ScalarExprLowering { .genThen([&]() { fir::factory::genScalarAssignment(builder, loc, temp, actualArg); - builder.create(loc, fir::getBase(temp)); + fir::ResultOp::create(builder, loc, fir::getBase(temp)); }) .genElse([&]() { mlir::Value absent = - builder.create(loc, tempAddrTy); - builder.create(loc, absent); + fir::AbsentOp::create(builder, loc, tempAddrTy); + fir::ResultOp::create(builder, loc, absent); }) .getResults()[0]; return {fir::substBase(temp, selectAddr), isPresent}; @@ -2647,7 +2649,7 @@ class ScalarExprLowering { mlir::Value boxStorage = builder.createTemporary(loc, boxTy); mlir::Value nullBox = fir::factory::createUnallocatedBox( builder, loc, boxTy, /*nonDeferredParams=*/{}); - builder.create(loc, nullBox, boxStorage); + fir::StoreOp::create(builder, loc, nullBox, boxStorage); caller.placeInput(arg, boxStorage); continue; } @@ -2663,8 +2665,7 @@ class ScalarExprLowering { /*nonDeferredParams=*/mlir::ValueRange{}, /*mutableProperties=*/{}); Fortran::lower::associateMutableBox(converter, loc, pointer, *expr, - /*lbounds=*/std::nullopt, - stmtCtx); + /*lbounds=*/{}, stmtCtx); caller.placeInput(arg, irBox); continue; } @@ -2707,9 +2708,10 @@ class ScalarExprLowering { mlir::cast(funcTy.getResult(0)); mlir::Value ref = builder.createConvertWithVolatileCast( loc, builder.getRefType(boxTy.getEleTy()), x.getAddr()); - auto len = builder.create( - loc, builder.getCharacterLengthType()); - return builder.create(loc, boxTy, ref, len); + auto len = fir::UndefOp::create( + builder, loc, builder.getCharacterLengthType()); + return fir::EmboxCharOp::create(builder, loc, boxTy, ref, + len); } return helper.createEmbox(x); }, @@ -2759,10 +2761,10 @@ class ScalarExprLowering { mlir::Value box = builder.createBox(loc, argAddr); if (isPresentValue) { mlir::Value convertedBox = builder.createConvert(loc, argTy, box); - auto absent = builder.create(loc, argTy); - caller.placeInput(arg, - builder.create( - loc, *isPresentValue, convertedBox, absent)); + auto absent = fir::AbsentOp::create(builder, loc, argTy); + caller.placeInput( + arg, mlir::arith::SelectOp::create( 
+ builder, loc, *isPresentValue, convertedBox, absent)); } else { caller.placeInput(arg, builder.createBox(loc, argAddr)); } @@ -2783,7 +2785,7 @@ class ScalarExprLowering { mlir::Value isAllocated = fir::factory::genIsAllocatedOrAssociatedTest(builder, loc, mutableBox); - auto absent = builder.create(loc, argTy); + auto absent = fir::AbsentOp::create(builder, loc, argTy); /// For now, assume it is not OK to pass the allocatable/pointer /// descriptor to a non pointer/allocatable dummy. That is a strict /// interpretation of 18.3.6 point 4 that stipulates the descriptor @@ -2802,14 +2804,15 @@ class ScalarExprLowering { box); } else if (mlir::isa(box.getType()) && fir::isPolymorphicType(argTy)) { - box = builder.create(loc, argTy, box, mlir::Value{}, - /*slice=*/mlir::Value{}); + box = fir::ReboxOp::create(builder, loc, argTy, box, mlir::Value{}, + /*slice=*/mlir::Value{}); } // Need the box types to be exactly similar for the selectOp. mlir::Value convertedBox = builder.createConvert(loc, argTy, box); - caller.placeInput(arg, builder.create( - loc, isAllocated, convertedBox, absent)); + caller.placeInput( + arg, mlir::arith::SelectOp::create(builder, loc, isAllocated, + convertedBox, absent)); } else { auto dynamicType = expr->GetType(); mlir::Value box; @@ -2831,12 +2834,12 @@ class ScalarExprLowering { .genThen([&]() { auto boxed = builder.createBox( loc, genBoxArg(*expr), fir::isPolymorphicType(argTy)); - builder.create(loc, boxed); + fir::ResultOp::create(builder, loc, boxed); }) .genElse([&]() { - auto absent = - builder.create(loc, argTy).getResult(); - builder.create(loc, absent); + auto absent = fir::AbsentOp::create(builder, loc, argTy) + .getResult(); + fir::ResultOp::create(builder, loc, absent); }) .getResults()[0]; } else { @@ -2868,19 +2871,19 @@ class ScalarExprLowering { loc, actualTy, box, mlir::Value{}, /*slice=*/mlir::Value{}) .getResult(); - builder.create(loc, rebox); + fir::ResultOp::create(builder, loc, rebox); }) .genElse([&]() { auto absent = - builder.create(loc, actualTy) + fir::AbsentOp::create(builder, loc, actualTy) .getResult(); - builder.create(loc, absent); + fir::ResultOp::create(builder, loc, absent); }) .getResults()[0]; } else { - box = builder.create(loc, actualTy, box, - mlir::Value{}, - /*slice=*/mlir::Value{}); + box = fir::ReboxOp::create(builder, loc, actualTy, box, + mlir::Value{}, + /*slice=*/mlir::Value{}); } } else if (Fortran::lower::isParentComponent(*expr)) { fir::ExtendedValue newExv = @@ -3133,12 +3136,12 @@ static void genScalarUserDefinedAssignmentCall(fir::FirOpBuilder &builder, if (argBaseType != fir::unwrapRefType(from.getType())) { // With logicals, it is possible that from is i1 here. 
if (fir::isa_ref_type(from.getType())) - from = builder.create(loc, from); + from = fir::LoadOp::create(builder, loc, from); from = builder.createConvert(loc, argBaseType, from); } if (!fir::isa_ref_type(from.getType())) { mlir::Value temp = builder.createTemporary(loc, argBaseType); - builder.create(loc, from, temp); + fir::StoreOp::create(builder, loc, from, temp); from = temp; } return builder.createConvert(loc, argType, from); @@ -3148,7 +3151,7 @@ static void genScalarUserDefinedAssignmentCall(fir::FirOpBuilder &builder, mlir::Type rhsType = func.getFunctionType().getInput(1); mlir::Value lhsArg = prepareUserDefinedArg(builder, loc, lhs, lhsType); mlir::Value rhsArg = prepareUserDefinedArg(builder, loc, rhs, rhsType); - builder.create(loc, func, mlir::ValueRange{lhsArg, rhsArg}); + fir::CallOp::create(builder, loc, func, mlir::ValueRange{lhsArg, rhsArg}); } /// Convert the result of a fir.array_modify to an ExtendedValue given the @@ -3188,17 +3191,17 @@ createDerivedArrayAmend(mlir::Location loc, fir::ArrayLoadOp destLoad, fir::factory::genRecordAssignment(builder, loc, destAcc, elementExv); } else { auto boxTy = fir::BoxType::get(eleTy); - auto toBox = builder.create(loc, boxTy, destAcc.getResult(), - mlir::Value{}, mlir::Value{}, - destLoad.getTypeparams()); - auto fromBox = builder.create( - loc, boxTy, fir::getBase(elementExv), mlir::Value{}, mlir::Value{}, - destLoad.getTypeparams()); + auto toBox = fir::EmboxOp::create(builder, loc, boxTy, destAcc.getResult(), + mlir::Value{}, mlir::Value{}, + destLoad.getTypeparams()); + auto fromBox = fir::EmboxOp::create( + builder, loc, boxTy, fir::getBase(elementExv), mlir::Value{}, + mlir::Value{}, destLoad.getTypeparams()); fir::factory::genRecordAssignment(builder, loc, fir::BoxValue(toBox), fir::BoxValue(fromBox)); } - return builder.create(loc, innerArg.getType(), innerArg, - destAcc); + return fir::ArrayAmendOp::create(builder, loc, innerArg.getType(), innerArg, + destAcc); } inline static fir::ArrayAmendOp @@ -3220,7 +3223,7 @@ createCharArrayAmend(mlir::Location loc, fir::FirOpBuilder &builder, helper.createAssign(fir::ExtendedValue{dstChar}, srcExv); // Mark this array element as amended. mlir::Type ty = innerArg.getType(); - auto amend = builder.create(loc, ty, innerArg, dstOp); + auto amend = fir::ArrayAmendOp::create(builder, loc, ty, innerArg, dstOp); return amend; } @@ -3236,7 +3239,7 @@ convertToArrayBoxValue(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Type ty = fir::unwrapRefType(val.getType()); mlir::IndexType idxTy = builder.getIndexType(); auto seqTy = mlir::cast(ty); - auto undef = builder.create(loc, idxTy); + auto undef = fir::UndefOp::create(builder, loc, idxTy); llvm::SmallVector extents(seqTy.getDimension(), undef); if (fir::isa_char(seqTy.getEleTy())) return fir::CharArrayBoxValue(val, len ? 
len : undef, extents); @@ -3316,10 +3319,10 @@ class ArrayExprLowering { ExtValue exv = lowerArrayExpression(rhs); if (explicitSpaceIsActive()) { explicitSpace->finalizeContext(); - builder.create(loc, fir::getBase(exv)); + fir::ResultOp::create(builder, loc, fir::getBase(exv)); } else { - builder.create( - loc, destination, fir::getBase(exv), destination.getMemref(), + fir::ArrayMergeStoreOp::create( + builder, loc, destination, fir::getBase(exv), destination.getMemref(), destination.getSlice(), destination.getTypeparams()); } } @@ -3433,8 +3436,8 @@ class ArrayExprLowering { assert(destination && "destination must have been set"); ExtValue exv = lowerArrayExpression(rhsCC, destination.getType()); if (!explicitSpaceIsActive()) - builder.create( - loc, destination, fir::getBase(exv), destination.getMemref(), + fir::ArrayMergeStoreOp::create( + builder, loc, destination, fir::getBase(exv), destination.getMemref(), destination.getSlice(), destination.getTypeparams()); // destShape may originally be null, if rhs did not define a shape. // In this case the destShape is computed from lhs, and we may have @@ -3503,7 +3506,7 @@ class ArrayExprLowering { lengthParams, assignToStorage); if (explicitSpaceIsActive()) { explicitSpace->finalizeContext(); - builder.create(loc, fir::getBase(realloc.newValue)); + fir::ResultOp::create(builder, loc, fir::getBase(realloc.newValue)); } fir::factory::finalizeRealloc(builder, loc, mutableBox, lbounds, takeLboundsIfRealloc, realloc); @@ -3548,9 +3551,9 @@ class ArrayExprLowering { mlir::Value tempRes = dest.getMemref(); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); mlir::Location loc = converter.getCurrentLocation(); - builder.create(loc, dest, fir::getBase(loopRes), - tempRes, dest.getSlice(), - dest.getTypeparams()); + fir::ArrayMergeStoreOp::create(builder, loc, dest, fir::getBase(loopRes), + tempRes, dest.getSlice(), + dest.getTypeparams()); auto arrTy = mlir::cast( fir::dyn_cast_ptrEleTy(tempRes.getType())); @@ -3596,25 +3599,26 @@ class ArrayExprLowering { // as there isn't any necessity for it. ccLoadDest = [=](llvm::ArrayRef shape) -> fir::ArrayLoadOp { mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - auto var = builder.create( - loc, builder.getRefType(hdrTy.getType(1)), header, one); - auto load = builder.create(loc, var); + auto var = fir::CoordinateOp::create( + builder, loc, builder.getRefType(hdrTy.getType(1)), header, one); + auto load = fir::LoadOp::create(builder, loc, var); mlir::Type eleTy = fir::unwrapSequenceType(fir::unwrapRefType(load.getType())); auto seqTy = fir::SequenceType::get(eleTy, shape.size()); mlir::Value castTo = builder.createConvert(loc, fir::HeapType::get(seqTy), load); mlir::Value shapeOp = builder.genShape(loc, shape); - return builder.create( - loc, seqTy, castTo, shapeOp, /*slice=*/mlir::Value{}, std::nullopt); + return fir::ArrayLoadOp::create(builder, loc, seqTy, castTo, shapeOp, + /*slice=*/mlir::Value{}, + mlir::ValueRange{}); }; // Custom lowering of the element store to deal with the extra indirection // to the lazy allocated buffer. 
ccStoreToDest = [=](IterSpace iters) { mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - auto var = builder.create( - loc, builder.getRefType(hdrTy.getType(1)), header, one); - auto load = builder.create(loc, var); + auto var = fir::CoordinateOp::create( + builder, loc, builder.getRefType(hdrTy.getType(1)), header, one); + auto load = fir::LoadOp::create(builder, loc, var); mlir::Type eleTy = fir::unwrapSequenceType(fir::unwrapRefType(load.getType())); auto seqTy = fir::SequenceType::get(eleTy, iters.iterVec().size()); @@ -3623,12 +3627,12 @@ class ArrayExprLowering { mlir::Value shape = builder.genShape(loc, genIterationShape()); llvm::SmallVector indices = fir::factory::originateIndices( loc, builder, castTo.getType(), shape, iters.iterVec()); - auto eleAddr = builder.create( - loc, builder.getRefType(eleTy), castTo, shape, + auto eleAddr = fir::ArrayCoorOp::create( + builder, loc, builder.getRefType(eleTy), castTo, shape, /*slice=*/mlir::Value{}, indices, destination.getTypeparams()); mlir::Value eleVal = builder.createConvert(loc, eleTy, iters.getElement()); - builder.create(loc, eleVal, eleAddr); + fir::StoreOp::create(builder, loc, eleVal, eleAddr); return iters.innerArgument(); }; @@ -3684,10 +3688,10 @@ class ArrayExprLowering { auto exv = lowerArrayExpression(rhs); if (explicitSpaceIsActive()) { explicitSpace->finalizeContext(); - builder.create(loc, fir::getBase(exv)); + fir::ResultOp::create(builder, loc, fir::getBase(exv)); } else { - builder.create( - loc, destination, fir::getBase(exv), destination.getMemref(), + fir::ArrayMergeStoreOp::create( + builder, loc, destination, fir::getBase(exv), destination.getMemref(), destination.getSlice(), destination.getTypeparams()); } } @@ -3766,7 +3770,7 @@ class ArrayExprLowering { std::size_t offset = explicitSpace->argPosition(oldInnerArg); explicitSpace->setInnerArg(offset, fir::getBase(lexv)); finalizeElementCtx(); - builder.create(loc, fir::getBase(lexv)); + fir::ResultOp::create(builder, loc, fir::getBase(lexv)); }; if (mlir::Operation *defOp = fir::getBase(lexv).getDefiningOp()) { llvm::TypeSwitch(defOp) @@ -3836,7 +3840,7 @@ class ArrayExprLowering { // 5). Thread the array value updated forward. if (!isIllFormedLHS) { finalizeElementCtx(); - builder.create(getLoc(), fir::getBase(lexv)); + fir::ResultOp::create(builder, getLoc(), fir::getBase(lexv)); } return lexv; } @@ -3979,7 +3983,7 @@ class ArrayExprLowering { if (auto origEleTy = fir::dyn_cast_ptrEleTy(origVal.getType())) if (mlir::isa(origEleTy)) { // If origVal is a box variable, load it so it is in the value domain. - origVal = builder.create(loc, origVal); + origVal = fir::LoadOp::create(builder, loc, origVal); } if (mlir::isa(origVal.getType()) && !mlir::isa(eleTy)) { @@ -3998,7 +4002,7 @@ class ArrayExprLowering { TODO(loc, "TARGET of pointer assignment with runtime size/shape"); auto memrefTy = fir::boxMemRefType(mlir::cast(eleTy)); auto castTo = builder.createConvert(loc, memrefTy, origVal); - origVal = builder.create(loc, eleTy, castTo); + origVal = fir::EmboxOp::create(builder, loc, eleTy, castTo); } mlir::Value val = builder.convertWithSemantics(loc, eleTy, origVal); if (isBoundsSpec()) { @@ -4007,9 +4011,9 @@ class ArrayExprLowering { if (lbs.size() > 0) { // Rebox the value with user-specified shift. 
auto shiftTy = fir::ShiftType::get(eleTy.getContext(), lbs.size()); - mlir::Value shiftOp = builder.create(loc, shiftTy, lbs); - val = builder.create(loc, eleTy, val, shiftOp, - mlir::Value{}); + mlir::Value shiftOp = fir::ShiftOp::create(builder, loc, shiftTy, lbs); + val = fir::ReboxOp::create(builder, loc, eleTy, val, shiftOp, + mlir::Value{}); } } else if (isBoundsRemap()) { assert(lbounds.has_value()); @@ -4020,9 +4024,9 @@ class ArrayExprLowering { auto shapeShiftArgs = flatZip(lbs, *ubounds); auto shapeTy = fir::ShapeShiftType::get(eleTy.getContext(), lbs.size()); mlir::Value shapeShift = - builder.create(loc, shapeTy, shapeShiftArgs); - val = builder.create(loc, eleTy, val, shapeShift, - mlir::Value{}); + fir::ShapeShiftOp::create(builder, loc, shapeTy, shapeShiftArgs); + val = fir::ReboxOp::create(builder, loc, eleTy, val, shapeShift, + mlir::Value{}); } } return val; @@ -4045,8 +4049,8 @@ class ArrayExprLowering { // memory into the destination array. mlir::Type resRefTy = builder.getRefType(eleTy); // Get a reference to the array element to be amended. - auto arrayOp = builder.create( - loc, resRefTy, innerArg, iterSpace.iterVec(), + auto arrayOp = fir::ArrayAccessOp::create( + builder, loc, resRefTy, innerArg, iterSpace.iterVec(), fir::factory::getTypeParams(loc, builder, destination)); if (auto charTy = mlir::dyn_cast(eleTy)) { llvm::SmallVector substringBounds; @@ -4067,9 +4071,9 @@ class ArrayExprLowering { } // By value semantics. The element is being assigned by value. auto ele = convertElementForUpdate(loc, eleTy, fir::getBase(exv)); - auto update = builder.create( - loc, arrTy, innerArg, ele, iterSpace.iterVec(), - destination.getTypeparams()); + auto update = fir::ArrayUpdateOp::create(builder, loc, arrTy, innerArg, + ele, iterSpace.iterVec(), + destination.getTypeparams()); return abstractArrayExtValue(update); }; } @@ -4094,7 +4098,7 @@ class ArrayExprLowering { : defaultStoreToDestination(/*substring=*/nullptr); mlir::Value updVal = fir::getBase(lambda(iterSpace)); finalizeElementCtx(); - builder.create(loc, updVal); + fir::ResultOp::create(builder, loc, updVal); builder.restoreInsertionPoint(insPt); return abstractArrayExtValue(iterSpace.outerResult()); } @@ -4208,7 +4212,7 @@ class ArrayExprLowering { auto addr = builder->create(loc, eleRefTy, tmp, shape, /*slice=*/mlir::Value{}, indices, - /*typeParams=*/std::nullopt); + /*typeParams=*/mlir::ValueRange{}); auto load = builder->create(loc, addr); return builder->createConvert(loc, i1Ty, load); }; @@ -4247,31 +4251,34 @@ class ArrayExprLowering { // Compute the dynamic position into the header. 
llvm::SmallVector offsets; for (auto doLoop : loopStack[i]) { - auto m = builder.create( - loc, doLoop.getInductionVar(), doLoop.getLowerBound()); - auto n = builder.create(loc, m, doLoop.getStep()); + auto m = mlir::arith::SubIOp::create( + builder, loc, doLoop.getInductionVar(), doLoop.getLowerBound()); + auto n = + mlir::arith::DivSIOp::create(builder, loc, m, doLoop.getStep()); mlir::Value one = builder.createIntegerConstant(loc, n.getType(), 1); - offsets.push_back(builder.create(loc, n, one)); + offsets.push_back(mlir::arith::AddIOp::create(builder, loc, n, one)); } mlir::IntegerType i32Ty = builder.getIntegerType(32); mlir::Value uno = builder.createIntegerConstant(loc, i32Ty, 1); mlir::Type coorTy = builder.getRefType(raggedTy.getType(1)); - auto hdOff = builder.create(loc, coorTy, header, uno); + auto hdOff = fir::CoordinateOp::create(builder, loc, coorTy, header, uno); auto toTy = fir::SequenceType::get(raggedTy, offsets.size()); mlir::Type toRefTy = builder.getRefType(toTy); - auto ldHdr = builder.create(loc, hdOff); + auto ldHdr = fir::LoadOp::create(builder, loc, hdOff); mlir::Value hdArr = builder.createConvert(loc, toRefTy, ldHdr); auto shapeOp = builder.genShape(loc, extents); - header = builder.create( - loc, builder.getRefType(raggedTy), hdArr, shapeOp, + header = fir::ArrayCoorOp::create( + builder, loc, builder.getRefType(raggedTy), hdArr, shapeOp, /*slice=*/mlir::Value{}, offsets, /*typeparams=*/mlir::ValueRange{}); - auto hdrVar = builder.create(loc, coorTy, header, uno); - auto inVar = builder.create(loc, hdrVar); + auto hdrVar = + fir::CoordinateOp::create(builder, loc, coorTy, header, uno); + auto inVar = fir::LoadOp::create(builder, loc, hdrVar); mlir::Value two = builder.createIntegerConstant(loc, i32Ty, 2); mlir::Type coorTy2 = builder.getRefType(raggedTy.getType(2)); - auto hdrSh = builder.create(loc, coorTy2, header, two); - auto shapePtr = builder.create(loc, hdrSh); + auto hdrSh = + fir::CoordinateOp::create(builder, loc, coorTy2, header, two); + auto shapePtr = fir::LoadOp::create(builder, loc, hdrSh); // Replace the binding. implicitSpace->rebind(expr, genMaskAccess(inVar, shapePtr)); if (i < depth - 1) @@ -4301,7 +4308,7 @@ class ArrayExprLowering { Fortran::lower::createLazyArrayTempValue(converter, *e, header, symMap, stmtCtx); // Close the explicit loops. - builder.create(loc, explicitSpace->getInnerArgs()); + fir::ResultOp::create(builder, loc, explicitSpace->getInnerArgs()); builder.setInsertionPointAfter(explicitSpace->getOuterLoop()); // Open a new copy of the explicit loop nest. 
explicitSpace->genLoopNest(); @@ -4327,9 +4334,10 @@ class ArrayExprLowering { fir::factory::getRaggedArrayHeaderType(builder); mlir::IntegerType i32Ty = builder.getIntegerType(32); mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - auto coor1 = builder.create( - loc, builder.getRefType(raggedTy.getType(1)), header, one); - auto db = builder.create(loc, coor1); + auto coor1 = fir::CoordinateOp::create( + builder, loc, builder.getRefType(raggedTy.getType(1)), header, + one); + auto db = fir::LoadOp::create(builder, loc, coor1); mlir::Type eleTy = fir::unwrapSequenceType(fir::unwrapRefType(db.getType())); mlir::Type buffTy = @@ -4338,17 +4346,18 @@ class ArrayExprLowering { mlir::Value buff = builder.createConvert(loc, buffTy, db); mlir::Value two = builder.createIntegerConstant(loc, i32Ty, 2); - auto coor2 = builder.create( - loc, builder.getRefType(raggedTy.getType(2)), header, two); - auto shBuff = builder.create(loc, coor2); + auto coor2 = fir::CoordinateOp::create( + builder, loc, builder.getRefType(raggedTy.getType(2)), header, + two); + auto shBuff = fir::LoadOp::create(builder, loc, coor2); mlir::IntegerType i64Ty = builder.getIntegerType(64); mlir::IndexType idxTy = builder.getIndexType(); llvm::SmallVector extents; for (std::remove_const_t i = 0; i < rank; ++i) { mlir::Value off = builder.createIntegerConstant(loc, i32Ty, i); - auto coor = builder.create( - loc, builder.getRefType(i64Ty), shBuff, off); - auto ldExt = builder.create(loc, coor); + auto coor = fir::CoordinateOp::create( + builder, loc, builder.getRefType(i64Ty), shBuff, off); + auto ldExt = fir::LoadOp::create(builder, loc, coor); extents.push_back(builder.createConvert(loc, idxTy, ldExt)); } if (destShape.empty()) @@ -4376,7 +4385,7 @@ class ArrayExprLowering { // run from 0 to `extent - 1` inclusive. for (auto extent : shape) loopUppers.push_back( - builder.create(loc, extent, one)); + mlir::arith::SubIOp::create(builder, loc, extent, one)); // Iteration space is created with outermost columns, innermost rows llvm::SmallVector loops; @@ -4391,16 +4400,16 @@ class ArrayExprLowering { } fir::DoLoopOp loop; if (innerArg) { - loop = builder.create( - loc, zero, i.value(), one, isUnordered(), + loop = fir::DoLoopOp::create( + builder, loc, zero, i.value(), one, isUnordered(), /*finalCount=*/false, mlir::ValueRange{innerArg}); innerArg = loop.getRegionIterArgs().front(); if (explicitSpaceIsActive()) explicitSpace->setInnerArg(0, innerArg); } else { - loop = builder.create(loc, zero, i.value(), one, - isUnordered(), - /*finalCount=*/false); + loop = fir::DoLoopOp::create(builder, loc, zero, i.value(), one, + isUnordered(), + /*finalCount=*/false); } ivars.push_back(loop.getInductionVar()); loops.push_back(loop); @@ -4410,7 +4419,7 @@ class ArrayExprLowering { for (std::remove_const_t i = 0; i + 1 < loopDepth; ++i) { builder.setInsertionPointToEnd(loops[i].getBody()); - builder.create(loc, loops[i + 1].getResult(0)); + fir::ResultOp::create(builder, loc, loops[i + 1].getResult(0)); } // Move insertion point to the start of the innermost loop in the nest. 
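
// ---------------------------------------------------------------------------
// Reviewer aid (not part of the patch): the hunk above builds the implicit
// iteration space as a nest of counted fir.do_loop ops, one per dimension,
// each running from 0 to extent-1 and threading the array "inner argument"
// through iter_args, with every outer loop forwarding the inner loop's result.
// The recursion below is only a scalar analogue of that control shape; runNest
// and the accumulator are illustrative, not Flang code.
#include <cstddef>
#include <iostream>
#include <vector>

long runNest(const std::vector<long> &extents, std::size_t dim, long innerArg,
             std::vector<long> &ivars) {
  if (dim == extents.size()) {
    // Innermost body: fold the induction variables into the threaded value.
    long flat = 0;
    for (long iv : ivars)
      flat = flat * 10 + iv;
    return innerArg + flat;
  }
  // One counted loop per dimension, 0 .. extent-1 inclusive.
  for (long iv = 0; iv <= extents[dim] - 1; ++iv) {
    ivars[dim] = iv;
    innerArg = runNest(extents, dim + 1, innerArg, ivars);
  }
  return innerArg; // outer "loop" yields the value produced by the inner one
}

int main() {
  std::vector<long> extents{2, 3};
  std::vector<long> ivars(extents.size(), 0);
  std::cout << runNest(extents, 0, /*innerArg=*/0, ivars) << "\n";
  return 0;
}
// ---------------------------------------------------------------------------
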
@@ -4468,21 +4477,23 @@ class ArrayExprLowering { implicitSpace->getMasks()) { const std::size_t size = maskExprs.size() - 1; auto genFalseBlock = [&](const auto *e, auto &&cond) { - auto ifOp = builder.create( - loc, mlir::TypeRange{innerArg.getType()}, fir::getBase(cond), - /*withElseRegion=*/true); - builder.create(loc, ifOp.getResult(0)); + auto ifOp = fir::IfOp::create(builder, loc, + mlir::TypeRange{innerArg.getType()}, + fir::getBase(cond), + /*withElseRegion=*/true); + fir::ResultOp::create(builder, loc, ifOp.getResult(0)); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - builder.create(loc, innerArg); + fir::ResultOp::create(builder, loc, innerArg); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); }; auto genTrueBlock = [&](const auto *e, auto &&cond) { - auto ifOp = builder.create( - loc, mlir::TypeRange{innerArg.getType()}, fir::getBase(cond), - /*withElseRegion=*/true); - builder.create(loc, ifOp.getResult(0)); + auto ifOp = fir::IfOp::create(builder, loc, + mlir::TypeRange{innerArg.getType()}, + fir::getBase(cond), + /*withElseRegion=*/true); + fir::ResultOp::create(builder, loc, ifOp.getResult(0)); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - builder.create(loc, innerArg); + fir::ResultOp::create(builder, loc, innerArg); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); }; for (std::remove_const_t i = 0; i < size; ++i) @@ -4523,17 +4534,18 @@ class ArrayExprLowering { fir::isRecordWithAllocatableMember(eleTy)) TODO(loc, "creating an array temp where the element type has " "allocatable members"); - mlir::Value temp = !seqTy.hasDynamicExtents() - ? builder.create(loc, type) - : builder.create( - loc, type, ".array.expr", std::nullopt, shape); + mlir::Value temp = + !seqTy.hasDynamicExtents() + ? fir::AllocMemOp::create(builder, loc, type) + : fir::AllocMemOp::create(builder, loc, type, ".array.expr", + mlir::ValueRange{}, shape); fir::FirOpBuilder *bldr = &converter.getFirOpBuilder(); stmtCtx.attachCleanup( [bldr, loc, temp]() { bldr->create(loc, temp); }); mlir::Value shapeOp = genShapeOp(shape); - return builder.create(loc, seqTy, temp, shapeOp, - /*slice=*/mlir::Value{}, - std::nullopt); + return fir::ArrayLoadOp::create(builder, loc, seqTy, temp, shapeOp, + /*slice=*/mlir::Value{}, + mlir::ValueRange{}); } static fir::ShapeOp genShapeOp(mlir::Location loc, fir::FirOpBuilder &builder, @@ -4542,7 +4554,7 @@ class ArrayExprLowering { llvm::SmallVector idxShape; for (auto s : shape) idxShape.push_back(builder.createConvert(loc, idxTy, s)); - return builder.create(loc, idxShape); + return fir::ShapeOp::create(builder, loc, idxShape); } fir::ShapeOp genShapeOp(llvm::ArrayRef shape) { @@ -4790,7 +4802,7 @@ class ArrayExprLowering { mlir::Type argTy = callSiteType.getInput(arg.firArgument); if (!actual) { // Optional dummy argument for which there is no actual argument. 
- auto absent = builder.create(loc, argTy); + auto absent = fir::AbsentOp::create(builder, loc, argTy); operands.emplace_back([=](IterSpace) { return absent; }); continue; } @@ -4823,7 +4835,7 @@ class ArrayExprLowering { builder.createTemporary(loc, val.getType(), llvm::ArrayRef{ fir::getAdaptToByRefAttr(builder)}); - builder.create(loc, val, temp); + fir::StoreOp::create(builder, loc, val, temp); operands.emplace_back( [=](IterSpace iters) -> ExtValue { return temp; }); } @@ -4882,14 +4894,14 @@ class ArrayExprLowering { fir::dyn_cast_ptrOrBoxEleTy(fir::getBase(exv).getType()); mlir::Type innerTy = fir::unwrapSequenceType(baseTy); operands.emplace_back([=](IterSpace iters) -> ExtValue { - mlir::Value coord = builder.create( - loc, fir::ReferenceType::get(innerTy), fir::getBase(exv), - iters.iterVec()); + mlir::Value coord = fir::CoordinateOp::create( + builder, loc, fir::ReferenceType::get(innerTy), + fir::getBase(exv), iters.iterVec()); mlir::Value empty; mlir::ValueRange emptyRange; - return builder.create( - loc, fir::ClassType::get(innerTy), coord, empty, empty, - emptyRange, sourceBox); + return fir::EmboxOp::create(builder, loc, + fir::ClassType::get(innerTy), coord, + empty, empty, emptyRange, sourceBox); }); } else { ExtValue exv = asScalarRef(*expr); @@ -4902,9 +4914,9 @@ class ArrayExprLowering { operands.emplace_back([=](IterSpace iters) -> ExtValue { mlir::Value empty; mlir::ValueRange emptyRange; - return builder.create( - loc, fir::ClassType::get(baseTy), fir::getBase(exv), empty, - empty, emptyRange); + return fir::EmboxOp::create( + builder, loc, fir::ClassType::get(baseTy), + fir::getBase(exv), empty, empty, emptyRange); }); } } @@ -5096,8 +5108,8 @@ class ArrayExprLowering { return exv.match( [&](const fir::CharBoxValue &cb) -> ExtValue { mlir::Value len = cb.getLen(); - auto mem = - builder.create(loc, toType, mlir::ValueRange{len}); + auto mem = fir::AllocaOp::create(builder, loc, toType, + mlir::ValueRange{len}); fir::CharBoxValue result(mem, len); fir::factory::CharacterExprHelper{builder, loc}.createAssign( ExtValue{result}, exv); @@ -5152,7 +5164,7 @@ class ArrayExprLowering { auto val = f(iters); mlir::Value base = fir::getBase(val); auto newBase = - builder.create(loc, base.getType(), base); + fir::NoReassocOp::create(builder, loc, base.getType(), base); return fir::substBase(val, newBase); }; } @@ -5169,10 +5181,10 @@ class ArrayExprLowering { if constexpr (CAT == Fortran::common::TypeCategory::Unsigned) { mlir::Value signless = builder.createConvert(loc, ty, val); mlir::Value neg = - builder.create(loc, zero, signless); + mlir::arith::SubIOp::create(builder, loc, zero, signless); return builder.createConvert(loc, val.getType(), neg); } - return builder.create(loc, zero, val); + return mlir::arith::SubIOp::create(builder, loc, zero, val); }; } template @@ -5191,7 +5203,7 @@ class ArrayExprLowering { mlir::Location loc = getLoc(); auto f = genarr(x.left()); return [=](IterSpace iters) -> ExtValue { - return builder.create(loc, fir::getBase(f(iters))); + return mlir::arith::NegFOp::create(builder, loc, fir::getBase(f(iters))); }; } template @@ -5200,7 +5212,7 @@ class ArrayExprLowering { mlir::Location loc = getLoc(); auto f = genarr(x.left()); return [=](IterSpace iters) -> ExtValue { - return builder.create(loc, fir::getBase(f(iters))); + return fir::NegcOp::create(builder, loc, fir::getBase(f(iters))); }; } @@ -5425,8 +5437,8 @@ class ArrayExprLowering { mlir::Location loc = getLoc(); mlir::Value lb = getLBound(x, dim, one); mlir::Value extent = 
fir::factory::readExtent(builder, loc, x, dim); - auto add = builder.create(loc, lb, extent); - return builder.create(loc, add, one); + auto add = mlir::arith::AddIOp::create(builder, loc, lb, extent); + return mlir::arith::SubIOp::create(builder, loc, add, one); } /// Return the extent of the boxed array `x` in dimesion `dim`. @@ -5468,11 +5480,11 @@ class ArrayExprLowering { if (destShape[0] != savedDestShape[dim]) { // Not the same, so choose the smaller value. mlir::Location loc = getLoc(); - auto cmp = builder.create( - loc, mlir::arith::CmpIPredicate::sgt, destShape[0], - savedDestShape[dim]); - auto sel = builder.create( - loc, cmp, savedDestShape[dim], destShape[0]); + auto cmp = mlir::arith::CmpIOp::create(builder, loc, + mlir::arith::CmpIPredicate::sgt, + destShape[0], savedDestShape[dim]); + auto sel = mlir::arith::SelectOp::create( + builder, loc, cmp, savedDestShape[dim], destShape[0]); savedDestShape[dim] = sel; destShape = savedDestShape; } @@ -5548,12 +5560,12 @@ class ArrayExprLowering { // FIXME: must use the lower bound of this component. auto arrLowerBound = atBase ? getLBound(arrayExv, subsIndex, one) : one; - auto initial = builder.create( - loc, lowerBound, arrLowerBound); - auto prod = builder.create( - loc, impliedIter, stride); - auto result = - builder.create(loc, initial, prod); + auto initial = mlir::arith::SubIOp::create( + builder, loc, lowerBound, arrLowerBound); + auto prod = mlir::arith::MulIOp::create( + builder, loc, impliedIter, stride); + auto result = mlir::arith::AddIOp::create(builder, loc, + initial, prod); newIters.setIndexValue(subsIndex, result); return newIters; }; @@ -5587,15 +5599,15 @@ class ArrayExprLowering { // using the base array's lower bound value. mlir::Value lb = fir::factory::readLowerBound( builder, loc, arrayExv, subsIndex, one); - auto origin = builder.create( - loc, idxTy, val, lb); + auto origin = mlir::arith::SubIOp::create(builder, loc, + idxTy, val, lb); newIters.setIndexValue(subsIndex, origin); return newIters; }; if (useTripsForSlice) { LLVM_ATTRIBUTE_UNUSED auto vectorSubscriptShape = getShape(arrayOperands.back()); - auto undef = builder.create(loc, idxTy); + auto undef = fir::UndefOp::create(builder, loc, idxTy); trips.push_back(undef); trips.push_back(undef); trips.push_back(undef); @@ -5613,7 +5625,7 @@ class ArrayExprLowering { // the array's declared rank. mlir::Value v = fir::getBase(asScalarArray(e)); trips.push_back(v); - auto undef = builder.create(loc, idxTy); + auto undef = fir::UndefOp::create(builder, loc, idxTy); trips.push_back(undef); trips.push_back(undef); auto currentPC = pc; @@ -5622,8 +5634,8 @@ class ArrayExprLowering { // Normalize `e` by subtracting the declared lbound. mlir::Value lb = fir::factory::readLowerBound( builder, loc, arrayExv, subsIndex, one); - mlir::Value ivAdj = - builder.create(loc, idxTy, iv, lb); + mlir::Value ivAdj = mlir::arith::SubIOp::create( + builder, loc, idxTy, iv, lb); // Add lbound adjusted value of `e` to the iteration vector // (except when creating a box because the iteration vector // is empty). 
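
// ---------------------------------------------------------------------------
// Reviewer aid (not part of the patch): two index computations recur in the
// hunks above -- the upper bound of a dimension is lb + extent - 1, and user
// subscripts are normalized to zero-based form by subtracting the declared
// lower bound before coordinate/array_coor address arithmetic. The struct and
// function names below are illustrative, not Flang code.
#include <cassert>

struct Dim {
  long lb;     // declared lower bound, e.g. A(-3:4) has lb = -3
  long extent; // number of elements in this dimension
};

long upperBound(const Dim &d) { return d.lb + d.extent - 1; }

long zeroBasedIndex(const Dim &d, long subscript) { return subscript - d.lb; }

int main() {
  Dim d{-3, 8};                       // Fortran dimension declared as (-3:4)
  assert(upperBound(d) == 4);         // -3 + 8 - 1
  assert(zeroBasedIndex(d, -3) == 0); // first element maps to offset 0
  assert(zeroBasedIndex(d, 4) == 7);  // last element maps to extent - 1
  return 0;
}
// ---------------------------------------------------------------------------
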
@@ -5640,8 +5652,8 @@ class ArrayExprLowering { builder.createConvert(loc, idxTy, newValue); mlir::Value lb = fir::factory::readLowerBound( builder, loc, arrayExv, subsIndex, one); - result = builder.create(loc, idxTy, - result, lb); + result = mlir::arith::SubIOp::create(builder, loc, idxTy, + result, lb); pc = [=](IterSpace iters) { IterationSpace newIters = currentPC(iters); newIters.insertIndexValue(subsIndex, result); @@ -5670,7 +5682,7 @@ class ArrayExprLowering { auto seqType = mlir::cast(ty); for (auto extent : seqType.getShape()) { auto v = extent == fir::SequenceType::getUnknownExtent() - ? builder.create(loc, idxTy).getResult() + ? fir::UndefOp::create(builder, loc, idxTy).getResult() : builder.createIntegerConstant(loc, idxTy, extent); result.push_back(v); } @@ -5763,20 +5775,20 @@ class ArrayExprLowering { mlir::Value one = builder.createIntegerConstant(loc, substringBounds[0].getType(), 1); substringBounds[0] = - builder.create(loc, substringBounds[0], one); + mlir::arith::SubIOp::create(builder, loc, substringBounds[0], one); // Convert the upper bound to a length. mlir::Value cast = builder.createConvert(loc, iTy, substringBounds[1]); mlir::Value zero = builder.createIntegerConstant(loc, iTy, 0); auto size = - builder.create(loc, cast, substringBounds[0]); - auto cmp = builder.create( - loc, mlir::arith::CmpIPredicate::sgt, size, zero); + mlir::arith::SubIOp::create(builder, loc, cast, substringBounds[0]); + auto cmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sgt, size, zero); // size = MAX(upper - (lower - 1), 0) substringBounds[1] = - builder.create(loc, cmp, size, zero); - slice = builder.create( - loc, padSlice(components.trips, shape), components.suffixComponents, - substringBounds); + mlir::arith::SelectOp::create(builder, loc, cmp, size, zero); + slice = fir::SliceOp::create( + builder, loc, padSlice(components.trips, shape), + components.suffixComponents, substringBounds); } else { slice = builder.createSlice(loc, extMemref, components.trips, components.suffixComponents); @@ -5826,7 +5838,7 @@ class ArrayExprLowering { } mlir::Value embox = mlir::isa(memref.getType()) - ? builder.create(loc, boxTy, memref, shape, slice) + ? fir::ReboxOp::create(builder, loc, boxTy, memref, shape, slice) .getResult() : builder .create(loc, boxTy, memref, shape, slice, @@ -5848,8 +5860,8 @@ class ArrayExprLowering { // ArrayCoorOp does not expect zero based indices. 
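The slice length computed a few lines above follows the comment "size = MAX(upper - (lower - 1), 0)": the lower bound is first made zero-based, then the difference is clamped at zero so an empty substring yields length 0 rather than a negative value. A plain C++ model of that arithmetic (hypothetical helper, not part of the lowering):

#include <algorithm>
#include <cstdint>
std::int64_t substringSliceLength(std::int64_t lower, std::int64_t upper) {
  // lower is 1-based; upper - (lower - 1) is the element count, clamped at 0.
  return std::max<std::int64_t>(upper - (lower - 1), 0);
}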
llvm::SmallVector indices = fir::factory::originateIndices( loc, builder, memref.getType(), shape, iters.iterVec()); - mlir::Value coor = builder.create( - loc, refEleTy, memref, shape, slice, indices, + mlir::Value coor = fir::ArrayCoorOp::create( + builder, loc, refEleTy, memref, shape, slice, indices, fir::getTypeParams(extMemref)); if (auto charTy = mlir::dyn_cast(eleTy)) { llvm::SmallVector substringBounds; @@ -5868,16 +5880,17 @@ class ArrayExprLowering { builder, loc, extMemref, coor, slice); }; } - auto arrLoad = builder.create( - loc, arrTy, memref, shape, slice, fir::getTypeParams(extMemref)); + auto arrLoad = + fir::ArrayLoadOp::create(builder, loc, arrTy, memref, shape, slice, + fir::getTypeParams(extMemref)); if (CrayPtr) { mlir::Type ptrTy = CrayPtr.getType(); mlir::Value cnvrt = Fortran::lower::addCrayPointerInst( loc, builder, CrayPtr, ptrTy, memref.getType()); - auto addr = builder.create(loc, cnvrt); - arrLoad = builder.create(loc, arrTy, addr, shape, slice, - fir::getTypeParams(extMemref)); + auto addr = fir::LoadOp::create(builder, loc, cnvrt); + arrLoad = fir::ArrayLoadOp::create(builder, loc, arrTy, addr, shape, + slice, fir::getTypeParams(extMemref)); } mlir::Value arrLd = arrLoad.getResult(); @@ -5905,9 +5918,9 @@ class ArrayExprLowering { mlir::Type eleTy = fir::applyPathToType(resTy, iters.iterVec()); mlir::Type refEleTy = fir::isa_ref_type(eleTy) ? eleTy : builder.getRefType(eleTy); - auto arrModify = builder.create( - loc, mlir::TypeRange{refEleTy, resTy}, innerArg, iters.iterVec(), - destination.getTypeparams()); + auto arrModify = fir::ArrayModifyOp::create( + builder, loc, mlir::TypeRange{refEleTy, resTy}, innerArg, + iters.iterVec(), destination.getTypeparams()); return abstractArrayExtValue(arrModify.getResult(1)); }; } @@ -5931,17 +5944,17 @@ class ArrayExprLowering { mlir::Type eleTy = fir::applyPathToType(arrTy, iters.iterVec()); if (isAdjustedArrayElementType(eleTy)) { mlir::Type eleRefTy = builder.getRefType(eleTy); - base = builder.create( - loc, eleRefTy, arrLd, iters.iterVec(), arrLdTypeParams); + base = fir::ArrayAccessOp::create(builder, loc, eleRefTy, arrLd, + iters.iterVec(), arrLdTypeParams); } else { - base = builder.create( - loc, eleTy, arrLd, iters.iterVec(), arrLdTypeParams); + base = fir::ArrayFetchOp::create(builder, loc, eleTy, arrLd, + iters.iterVec(), arrLdTypeParams); } mlir::Value temp = builder.createTemporary(loc, base.getType(), llvm::ArrayRef{ fir::getAdaptToByRefAttr(builder)}); - builder.create(loc, base, temp); + fir::StoreOp::create(builder, loc, base, temp); return fir::factory::arraySectionElementToExtendedValue( builder, loc, extMemref, temp, slice); }; @@ -5952,8 +5965,8 @@ class ArrayExprLowering { mlir::Type eleTy = fir::applyPathToType(arrTy, iters.iterVec()); if (isAdjustedArrayElementType(eleTy)) { mlir::Type eleRefTy = builder.getRefType(eleTy); - mlir::Value arrayOp = builder.create( - loc, eleRefTy, arrLd, iters.iterVec(), arrLdTypeParams); + mlir::Value arrayOp = fir::ArrayAccessOp::create( + builder, loc, eleRefTy, arrLd, iters.iterVec(), arrLdTypeParams); if (auto charTy = mlir::dyn_cast(eleTy)) { llvm::SmallVector substringBounds; populateBounds(substringBounds, components.substring); @@ -5968,8 +5981,8 @@ class ArrayExprLowering { return fir::factory::arraySectionElementToExtendedValue( builder, loc, extMemref, arrayOp, slice); } - auto arrFetch = builder.create( - loc, eleTy, arrLd, iters.iterVec(), arrLdTypeParams); + auto arrFetch = fir::ArrayFetchOp::create( + builder, loc, eleTy, arrLd, iters.iterVec(), 
arrLdTypeParams); return fir::factory::arraySectionElementToExtendedValue( builder, loc, extMemref, arrFetch, slice); }; @@ -6008,8 +6021,8 @@ class ArrayExprLowering { mlir::Value memref = fir::getBase(exv); mlir::Value shape = builder.createShape(loc, exv); mlir::Value noSlice; - auto arrLoad = builder.create( - loc, arrType, memref, shape, noSlice, fir::getTypeParams(exv)); + auto arrLoad = fir::ArrayLoadOp::create( + builder, loc, arrType, memref, shape, noSlice, fir::getTypeParams(exv)); mlir::Operation::operand_range arrLdTypeParams = arrLoad.getTypeparams(); mlir::Value arrLd = arrLoad.getResult(); // Mark the load to tell later passes it is unsafe to use this array_load @@ -6024,8 +6037,8 @@ class ArrayExprLowering { // By value semantics. auto cc = [=](IterSpace iters) -> ExtValue { - auto arrFetch = builder.create( - loc, eleType, arrLd, iters.iterVec(), arrLdTypeParams); + auto arrFetch = fir::ArrayFetchOp::create( + builder, loc, eleType, arrLd, iters.iterVec(), arrLdTypeParams); return fir::factory::arraySectionElementToExtendedValue( builder, loc, exv, arrFetch, noSlice); }; @@ -6063,12 +6076,12 @@ class ArrayExprLowering { .genIfOp(loc, {eleType}, isPresent, /*withElseRegion=*/true) .genThen([&]() { - builder.create(loc, fir::getBase(cc(iters))); + fir::ResultOp::create(builder, loc, fir::getBase(cc(iters))); }) .genElse([&]() { mlir::Value zero = fir::factory::createZeroValue(builder, loc, eleType); - builder.create(loc, zero); + fir::ResultOp::create(builder, loc, zero); }) .getResults()[0]; return elementValue; @@ -6177,8 +6190,8 @@ class ArrayExprLowering { mlir::Type eleRefTy = builder.getRefType(eleTy); mlir::Type resRefTy = builder.getRefType(resTy); mlir::Value nullPtr = builder.createNullConstant(loc, resRefTy); - auto offset = builder.create( - loc, eleRefTy, nullPtr, mlir::ValueRange{multiplier}); + auto offset = fir::CoordinateOp::create(builder, loc, eleRefTy, nullPtr, + mlir::ValueRange{multiplier}); return builder.createConvert(loc, idxTy, offset); } @@ -6186,14 +6199,14 @@ class ArrayExprLowering { mlir::FunctionType memcpyType() { auto ptrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); llvm::SmallVector args = {ptrTy, ptrTy, builder.getI64Type()}; - return mlir::FunctionType::get(builder.getContext(), args, std::nullopt); + return mlir::FunctionType::get(builder.getContext(), args, {}); } /// Create a call to the LLVM memcpy intrinsic. void createCallMemcpy(llvm::ArrayRef args, bool isVolatile) { mlir::Location loc = getLoc(); - builder.create(loc, args[0], args[1], args[2], - isVolatile); + mlir::LLVM::MemcpyOp::create(builder, loc, args[0], args[1], args[2], + isVolatile); } // Construct code to check for a buffer overrun and realloc the buffer when @@ -6203,32 +6216,33 @@ class ArrayExprLowering { mlir::Value eleSz) { mlir::Location loc = getLoc(); mlir::func::FuncOp reallocFunc = fir::factory::getRealloc(builder); - auto cond = builder.create( - loc, mlir::arith::CmpIPredicate::sle, bufferSize, needed); - auto ifOp = builder.create(loc, mem.getType(), cond, - /*withElseRegion=*/true); + auto cond = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sle, bufferSize, needed); + auto ifOp = fir::IfOp::create(builder, loc, mem.getType(), cond, + /*withElseRegion=*/true); auto insPt = builder.saveInsertionPoint(); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); // Not enough space, resize the buffer. 
mlir::IndexType idxTy = builder.getIndexType(); mlir::Value two = builder.createIntegerConstant(loc, idxTy, 2); - auto newSz = builder.create(loc, needed, two); - builder.create(loc, newSz, buffSize); - mlir::Value byteSz = builder.create(loc, newSz, eleSz); + auto newSz = mlir::arith::MulIOp::create(builder, loc, needed, two); + fir::StoreOp::create(builder, loc, newSz, buffSize); + mlir::Value byteSz = + mlir::arith::MulIOp::create(builder, loc, newSz, eleSz); mlir::SymbolRefAttr funcSymAttr = builder.getSymbolRefAttr(reallocFunc.getName()); mlir::FunctionType funcTy = reallocFunc.getFunctionType(); - auto newMem = builder.create( - loc, funcSymAttr, funcTy.getResults(), + auto newMem = fir::CallOp::create( + builder, loc, funcSymAttr, funcTy.getResults(), llvm::ArrayRef{ builder.createConvert(loc, funcTy.getInputs()[0], mem), builder.createConvert(loc, funcTy.getInputs()[1], byteSz)}); mlir::Value castNewMem = builder.createConvert(loc, mem.getType(), newMem.getResult(0)); - builder.create(loc, castNewMem); + fir::ResultOp::create(builder, loc, castNewMem); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); // Otherwise, just forward the buffer. - builder.create(loc, mem); + fir::ResultOp::create(builder, loc, mem); builder.restoreInsertionPoint(insPt); return ifOp.getResult(0); } @@ -6240,8 +6254,8 @@ class ArrayExprLowering { mlir::Value eleSz, mlir::Type eleTy, mlir::Type eleRefTy, mlir::Type resTy) { mlir::Location loc = getLoc(); - auto off = builder.create(loc, buffPos); - auto limit = builder.create(loc, buffSize); + auto off = fir::LoadOp::create(builder, loc, buffPos); + auto limit = fir::LoadOp::create(builder, loc, buffSize); mlir::IndexType idxTy = builder.getIndexType(); mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); @@ -6259,7 +6273,7 @@ class ArrayExprLowering { mlir::Value length = fir::getLen(exv); if (!length) fir::emitFatalError(loc, "result is not boxed character"); - eleSz = builder.create(loc, bytes, length); + eleSz = mlir::arith::MulIOp::create(builder, loc, bytes, length); } else { TODO(loc, "PDT size"); // Will call the PDT's size function with the type parameters. @@ -6278,13 +6292,13 @@ class ArrayExprLowering { refTy = builder.getRefType(chTy); mlir::Type toTy = builder.getRefType(builder.getVarLenSeqTy(chTy)); buff = builder.createConvert(loc, toTy, buff); - off = builder.create(loc, off, eleSz); + off = mlir::arith::MulIOp::create(builder, loc, off, eleSz); } else { TODO(loc, "PDT offset"); } } - auto coor = builder.create(loc, refTy, buff, - mlir::ValueRange{off}); + auto coor = fir::CoordinateOp::create(builder, loc, refTy, buff, + mlir::ValueRange{off}); return builder.createConvert(loc, eleRefTy, coor); }; @@ -6293,15 +6307,15 @@ class ArrayExprLowering { // Compute the array size. mlir::Value arrSz = one; for (auto ext : v.getExtents()) - arrSz = builder.create(loc, arrSz, ext); + arrSz = mlir::arith::MulIOp::create(builder, loc, arrSz, ext); // Grow the buffer as needed. - auto endOff = builder.create(loc, off, arrSz); + auto endOff = mlir::arith::AddIOp::create(builder, loc, off, arrSz); mem = growBuffer(mem, endOff, limit, buffSize, eleSz); // Copy the elements to the buffer. 
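growBuffer, shown above, only reallocates when the current capacity cannot hold the needed element count (the arith.cmpi sle test), and then doubles past the requirement so repeated appends amortize the realloc calls. A hypothetical plain C++ model of that policy; the names and the use of std::realloc are illustrative only, the lowering calls the Fortran runtime's realloc helper:

#include <cstdlib>
void *growBufferModel(void *mem, std::size_t needed, std::size_t &capacity,
                      std::size_t eleSize) {
  if (capacity <= needed) {          // cf. the cmpi sle check above
    capacity = needed * 2;           // new size = needed * 2
    mem = std::realloc(mem, capacity * eleSize); // error handling elided
  }
  return mem;
}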
mlir::Value byteSz = - builder.create(loc, arrSz, eleSz); + mlir::arith::MulIOp::create(builder, loc, arrSz, eleSz); auto buff = builder.createConvert(loc, fir::HeapType::get(resTy), mem); mlir::Value buffi = computeCoordinate(buff, off); llvm::SmallVector args = fir::runtime::createArguments( @@ -6310,13 +6324,13 @@ class ArrayExprLowering { createCallMemcpy(args, isVolatile); // Save the incremented buffer position. - builder.create(loc, endOff, buffPos); + fir::StoreOp::create(builder, loc, endOff, buffPos); }; // Copy a trivial scalar value into the buffer. auto doTrivialScalar = [&](const ExtValue &v, mlir::Value len = {}) { // Increment the buffer position. - auto plusOne = builder.create(loc, off, one); + auto plusOne = mlir::arith::AddIOp::create(builder, loc, off, one); // Grow the buffer as needed. mem = growBuffer(mem, plusOne, limit, buffSize, eleSz); @@ -6324,8 +6338,8 @@ class ArrayExprLowering { // Store the element in the buffer. mlir::Value buff = builder.createConvert(loc, fir::HeapType::get(resTy), mem); - auto buffi = builder.create(loc, eleRefTy, buff, - mlir::ValueRange{off}); + auto buffi = fir::CoordinateOp::create(builder, loc, eleRefTy, buff, + mlir::ValueRange{off}); fir::factory::genScalarAssignment( builder, loc, [&]() -> ExtValue { @@ -6334,7 +6348,7 @@ class ArrayExprLowering { return buffi; }(), v); - builder.create(loc, plusOne, buffPos); + fir::StoreOp::create(builder, loc, plusOne, buffPos); }; // Copy the value. @@ -6346,7 +6360,7 @@ class ArrayExprLowering { doTrivialScalar(exv, eleSz); } else { // Increment the buffer position. - auto plusOne = builder.create(loc, off, one); + auto plusOne = mlir::arith::AddIOp::create(builder, loc, off, one); // Grow the buffer as needed. mem = growBuffer(mem, plusOne, limit, buffSize, eleSz); @@ -6361,7 +6375,7 @@ class ArrayExprLowering { fir::isa_volatile_type(v.getAddr().getType()); createCallMemcpy(args, isVolatile); - builder.create(loc, plusOne, buffPos); + fir::StoreOp::create(builder, loc, plusOne, buffPos); } }, [&](const fir::ArrayBoxValue &v) { doAbstractArray(v); }, @@ -6402,8 +6416,8 @@ class ArrayExprLowering { auto seqTy = mlir::cast(resTy); mlir::Type eleTy = fir::unwrapSequenceType(seqTy); auto loop = - builder.create(loc, lo, up, step, /*unordered=*/false, - /*finalCount=*/false, mem); + fir::DoLoopOp::create(builder, loc, lo, up, step, /*unordered=*/false, + /*finalCount=*/false, mem); // create a new binding for x.name(), to ac-do-variable, to the iteration // value. symMap.pushImpliedDoBinding(toStringRef(x.name()), loop.getInductionVar()); @@ -6433,22 +6447,22 @@ class ArrayExprLowering { mlir::Value castLen = builder.createConvert(loc, builder.getI64Type(), fir::getLen(exv)); assert(charLen.has_value()); - builder.create(loc, castLen, *charLen); + fir::StoreOp::create(builder, loc, castLen, *charLen); } } stmtCtx.finalizeAndPop(); - builder.create(loc, mem); + fir::ResultOp::create(builder, loc, mem); builder.restoreInsertionPoint(insPt); mem = loop.getResult(0); symMap.popImpliedDoBinding(); llvm::SmallVector extents = { - builder.create(loc, buffPos).getResult()}; + fir::LoadOp::create(builder, loc, buffPos).getResult()}; // Convert to extended value. 
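The array-constructor buffer is threaded through the implied-DO as a loop-carried value: `mem` is passed to fir.do_loop as an iteration argument, each trip yields the possibly reallocated buffer via fir.result, and the value the constructor finally uses is `loop.getResult(0)`. A rough, self-contained C++ analogue of that dataflow (hypothetical, assuming a positive step; not the generated code):

#include <cstdint>
#include <vector>
std::vector<std::int64_t> impliedDoModel(std::int64_t lo, std::int64_t up,
                                         std::int64_t step) {
  std::vector<std::int64_t> buffer;   // loop-carried value
  for (std::int64_t iv = lo; iv <= up; iv += step)
    buffer.push_back(iv);             // append may reallocate the storage
  return buffer;                      // cf. loop.getResult(0)
}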
if (fir::isa_char(seqTy.getEleTy())) { assert(charLen.has_value()); - auto len = builder.create(loc, *charLen); + auto len = fir::LoadOp::create(builder, loc, *charLen); return {fir::CharArrayBoxValue{mem, len, extents}, /*needCopy=*/false}; } return {fir::ArrayBoxValue{mem, extents}, /*needCopy=*/false}; @@ -6471,7 +6485,7 @@ class ArrayExprLowering { mlir::Value buffSize = builder.createTemporary(loc, idxTy, ".buff.size"); mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); mlir::Value buffPos = builder.createTemporary(loc, idxTy, ".buff.pos"); - builder.create(loc, zero, buffPos); + fir::StoreOp::create(builder, loc, zero, buffPos); // Allocate space for the array to be constructed. mlir::Value mem; if (fir::hasDynamicSize(resTy)) { @@ -6479,22 +6493,22 @@ class ArrayExprLowering { // The size of each element may depend on a general expression. Defer // creating the buffer until after the expression is evaluated. mem = builder.createNullConstant(loc, builder.getRefType(eleTy)); - builder.create(loc, zero, buffSize); + fir::StoreOp::create(builder, loc, zero, buffSize); } else { mlir::Value initBuffSz = builder.createIntegerConstant(loc, idxTy, clInitialBufferSize); - mem = builder.create( - loc, eleTy, /*typeparams=*/std::nullopt, initBuffSz); - builder.create(loc, initBuffSz, buffSize); + mem = fir::AllocMemOp::create( + builder, loc, eleTy, /*typeparams=*/mlir::ValueRange{}, initBuffSz); + fir::StoreOp::create(builder, loc, initBuffSz, buffSize); } } else { - mem = builder.create(loc, resTy); + mem = fir::AllocMemOp::create(builder, loc, resTy); int64_t buffSz = 1; for (auto extent : seqTy.getShape()) buffSz *= extent; mlir::Value initBuffSz = builder.createIntegerConstant(loc, idxTy, buffSz); - builder.create(loc, initBuffSz, buffSize); + fir::StoreOp::create(builder, loc, initBuffSz, buffSize); } // Compute size of element mlir::Type eleRefTy = builder.getRefType(eleTy); @@ -6516,12 +6530,12 @@ class ArrayExprLowering { charLen = builder.createTemporary(loc, builder.getI64Type()); mlir::Value castLen = builder.createConvert(loc, builder.getI64Type(), fir::getLen(exv)); - builder.create(loc, castLen, *charLen); + fir::StoreOp::create(builder, loc, castLen, *charLen); } } mem = builder.createConvert(loc, fir::HeapType::get(resTy), mem); llvm::SmallVector extents = { - builder.create(loc, buffPos)}; + fir::LoadOp::create(builder, loc, buffPos)}; // Cleanup the temporary. fir::FirOpBuilder *bldr = &converter.getFirOpBuilder(); @@ -6531,7 +6545,7 @@ class ArrayExprLowering { // Return the continuation. 
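The initial allocation above distinguishes three cases: a fully constant shape gets an exact element count (the product of the extents), a dynamic shape with a known element size starts from the default clInitialBufferSize, and a dynamic element size defers allocation until the expression has been evaluated. A small sketch of the constant-shape count only (hypothetical helper):

#include <cstdint>
#include <vector>
std::int64_t staticElementCount(const std::vector<std::int64_t> &shape) {
  std::int64_t count = 1;
  for (std::int64_t extent : shape)
    count *= extent;                  // cf. buffSz *= extent above
  return count;
}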
if (fir::isa_char(seqTy.getEleTy())) { if (charLen) { - auto len = builder.create(loc, *charLen); + auto len = fir::LoadOp::create(builder, loc, *charLen); return genarr(fir::CharArrayBoxValue{mem, len, extents}); } return genarr(fir::CharArrayBoxValue{mem, zero, extents}); @@ -6568,7 +6582,7 @@ class ArrayExprLowering { return [=](IterSpace iters) -> ExtValue { mlir::Value logical = fir::getBase(lambda(iters)); mlir::Value val = builder.createConvert(loc, i1Ty, logical); - return builder.create(loc, val, truth); + return mlir::arith::XOrIOp::create(builder, loc, val, truth); }; } template @@ -6582,7 +6596,7 @@ class ArrayExprLowering { mlir::Value right = fir::getBase(rf(iters)); mlir::Value lhs = builder.createConvert(loc, i1Ty, left); mlir::Value rhs = builder.createConvert(loc, i1Ty, right); - return builder.create(loc, lhs, rhs); + return OP::create(builder, loc, lhs, rhs); }; } template @@ -6596,7 +6610,7 @@ class ArrayExprLowering { mlir::Value right = fir::getBase(rf(iters)); mlir::Value lhs = builder.createConvert(loc, i1Ty, left); mlir::Value rhs = builder.createConvert(loc, i1Ty, right); - return builder.create(loc, pred, lhs, rhs); + return OP::create(builder, loc, pred, lhs, rhs); }; } template @@ -6636,9 +6650,9 @@ class ArrayExprLowering { Fortran::common::TypeCategory::Integer, *unsignedKind); mlir::Value lhsSL = builder.createConvert(loc, signlessType, lhs); mlir::Value rhsSL = builder.createConvert(loc, signlessType, rhs); - return builder.create(loc, pred, lhsSL, rhsSL); + return OP::create(builder, loc, pred, lhsSL, rhsSL); } - return builder.create(loc, pred, lhs, rhs); + return OP::create(builder, loc, pred, lhs, rhs); }; } template @@ -6805,9 +6819,8 @@ class ArrayExprLowering { : one; mlir::Value val = builder.createConvert( loc, idxTy, subscriptVal); - mlir::Value ivAdj = - builder.create( - loc, idxTy, val, lb); + mlir::Value ivAdj = mlir::arith::SubIOp::create( + builder, loc, idxTy, val, lb); componentsToAdd.push_back( builder.createConvert(loc, idxTy, ivAdj)); }, @@ -6829,8 +6842,9 @@ class ArrayExprLowering { converter.getRecordTypeFieldName(getLastSym(*x)); if (auto recTy = mlir::dyn_cast(ty)) { ty = recTy.getType(name); - auto fld = builder.create( - loc, fieldTy, name, recTy, fir::getTypeParams(arrayExv)); + auto fld = fir::FieldIndexOp::create( + builder, loc, fieldTy, name, recTy, + fir::getTypeParams(arrayExv)); addComponentList(ty, {fld}); if (index != revPath.size() - 1 || !isPointerAssignment()) { // Need an intermediate dereference if the boxed value @@ -6851,8 +6865,9 @@ class ArrayExprLowering { ty = fir::unwrapRefType(boxTy.getEleTy()); auto recTy = mlir::cast(ty); ty = recTy.getType(name); - auto fld = builder.create( - loc, fieldTy, name, recTy, fir::getTypeParams(arrayExv)); + auto fld = fir::FieldIndexOp::create( + builder, loc, fieldTy, name, recTy, + fir::getTypeParams(arrayExv)); extendComponent(components, ty, {fld}); } else { TODO(loc, "other component type"); @@ -6895,8 +6910,8 @@ class ArrayExprLowering { mlir::Value innerArg = esp->findArgumentOfLoad(load); if (isAdjustedArrayElementType(eleTy)) { mlir::Type eleRefTy = builder.getRefType(eleTy); - auto arrayOp = builder.create( - loc, eleRefTy, innerArg, iters.iterVec(), + auto arrayOp = fir::ArrayAccessOp::create( + builder, loc, eleRefTy, innerArg, iters.iterVec(), fir::factory::getTypeParams(loc, builder, load)); if (auto charTy = mlir::dyn_cast(eleTy)) { mlir::Value dstLen = fir::factory::genLenOfCharacter( @@ -6923,9 +6938,9 @@ class ArrayExprLowering { if (!eleBoxTy || 
!mlir::isa(eleBoxTy)) TODO(loc, "assignment in a FORALL involving a designator with a " "POINTER or ALLOCATABLE component part-ref"); - auto arrayOp = builder.create( - loc, builder.getRefType(eleBoxTy), innerArg, iters.iterVec(), - fir::factory::getTypeParams(loc, builder, load)); + auto arrayOp = fir::ArrayAccessOp::create( + builder, loc, builder.getRefType(eleBoxTy), innerArg, + iters.iterVec(), fir::factory::getTypeParams(loc, builder, load)); mlir::Value addr = components.getExtendCoorRef()(arrayOp); components.resetExtendCoorRef(); // When the lhs is a boxed value and the context is not a pointer @@ -6934,19 +6949,19 @@ class ArrayExprLowering { if (!isPointerAssignment()) { if (auto boxTy = mlir::dyn_cast(eleTy)) { eleTy = fir::boxMemRefType(boxTy); - addr = builder.create(loc, eleTy, addr); + addr = fir::BoxAddrOp::create(builder, loc, eleTy, addr); eleTy = fir::unwrapRefType(eleTy); } } auto ele = convertElementForUpdate(loc, eleTy, iters.getElement()); - builder.create(loc, ele, addr); - auto amend = builder.create( - loc, innerArg.getType(), innerArg, arrayOp); + fir::StoreOp::create(builder, loc, ele, addr); + auto amend = fir::ArrayAmendOp::create( + builder, loc, innerArg.getType(), innerArg, arrayOp); return arrayLoadExtValue(builder, loc, load, iters.iterVec(), amend); } auto ele = convertElementForUpdate(loc, eleTy, iters.getElement()); - auto update = builder.create( - loc, innerArg.getType(), innerArg, ele, iters.iterVec(), + auto update = fir::ArrayUpdateOp::create( + builder, loc, innerArg.getType(), innerArg, ele, iters.iterVec(), fir::factory::getTypeParams(loc, builder, load)); return arrayLoadExtValue(builder, loc, load, iters.iterVec(), update); }; @@ -6960,9 +6975,9 @@ class ArrayExprLowering { mlir::Value innerArg = explicitSpace->findArgumentOfLoad(load); mlir::Type refEleTy = fir::isa_ref_type(eleTy) ? eleTy : builder.getRefType(eleTy); - auto arrModify = builder.create( - loc, mlir::TypeRange{refEleTy, innerArg.getType()}, innerArg, - iters.iterVec(), load.getTypeparams()); + auto arrModify = fir::ArrayModifyOp::create( + builder, loc, mlir::TypeRange{refEleTy, innerArg.getType()}, + innerArg, iters.iterVec(), load.getTypeparams()); return arrayLoadExtValue(builder, loc, load, iters.iterVec(), arrModify.getResult(1)); }; @@ -6973,8 +6988,8 @@ class ArrayExprLowering { isAdjustedArrayElementType(eleTy)) { mlir::Type resTy = builder.getRefType(eleTy); // Use array element reference semantics. - auto access = builder.create( - loc, resTy, load, iters.iterVec(), + auto access = fir::ArrayAccessOp::create( + builder, loc, resTy, load, iters.iterVec(), fir::factory::getTypeParams(loc, builder, load)); mlir::Value newBase = access; if (fir::isa_char(eleTy)) { @@ -6996,8 +7011,8 @@ class ArrayExprLowering { if (!eleBoxTy || !mlir::isa(eleBoxTy)) TODO(loc, "assignment in a FORALL involving a designator with a " "POINTER or ALLOCATABLE component part-ref"); - auto access = builder.create( - loc, builder.getRefType(eleBoxTy), load, iters.iterVec(), + auto access = fir::ArrayAccessOp::create( + builder, loc, builder.getRefType(eleBoxTy), load, iters.iterVec(), fir::factory::getTypeParams(loc, builder, load)); mlir::Value addr = components.getExtendCoorRef()(access); components.resetExtendCoorRef(); @@ -7009,8 +7024,8 @@ class ArrayExprLowering { // Rhs is a regular expression that will need to be boxed before // assigning to the boxed variable. 
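The FORALL assignments above are expressed in value terms: fir.array_update returns a whole new array value with one element replaced, while the array_access + store + array_amend path writes through a reference and then re-defines the array value so later passes still see a fresh definition. A hypothetical value-semantics analogue of the array_update case only:

#include <cstddef>
#include <vector>
std::vector<int> arrayUpdateModel(std::vector<int> arr, std::size_t i, int v) {
  arr[i] = v;   // the local copy is modified; the caller's value is untouched
  return arr;   // a new version of the array, as array_update produces
}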
auto typeParams = fir::factory::getTypeParams(loc, builder, load); - auto access = builder.create( - loc, builder.getRefType(eleTy), load, iters.iterVec(), + auto access = fir::ArrayAccessOp::create( + builder, loc, builder.getRefType(eleTy), load, iters.iterVec(), typeParams); auto addr = components.getExtendCoorRef()(access); components.resetExtendCoorRef(); @@ -7024,14 +7039,14 @@ class ArrayExprLowering { TODO(loc, "need to adjust typeparameter(s) to reflect the final " "component"); mlir::Value embox = - builder.create(loc, boxTy, ptrAddr, - /*shape=*/mlir::Value{}, - /*slice=*/mlir::Value{}, typeParams); + fir::EmboxOp::create(builder, loc, boxTy, ptrAddr, + /*shape=*/mlir::Value{}, + /*slice=*/mlir::Value{}, typeParams); return arrayLoadExtValue(builder, loc, load, iters.iterVec(), embox); } } - auto fetch = builder.create( - loc, eleTy, load, iters.iterVec(), load.getTypeparams()); + auto fetch = fir::ArrayFetchOp::create( + builder, loc, eleTy, load, iters.iterVec(), load.getTypeparams()); return arrayLoadExtValue(builder, loc, load, iters.iterVec(), fetch); }; return [=](IterSpace iters) mutable { return lambda(pc(iters)); }; @@ -7540,24 +7555,25 @@ fir::ExtendedValue Fortran::lower::updateBoxForParentComponent( if (op) { if (auto embox = mlir::dyn_cast(op)) { - auto newBox = builder.create( - loc, fir::BoxType::get(actualTy), embox.getMemref(), embox.getShape(), - embox.getSlice(), embox.getTypeparams()); + auto newBox = fir::EmboxOp::create( + builder, loc, fir::BoxType::get(actualTy), embox.getMemref(), + embox.getShape(), embox.getSlice(), embox.getTypeparams()); return fir::substBase(box, newBox); } if (auto rebox = mlir::dyn_cast(op)) { - auto newBox = builder.create( - loc, fir::BoxType::get(actualTy), rebox.getBox(), rebox.getShape(), - rebox.getSlice()); + auto newBox = fir::ReboxOp::create( + builder, loc, fir::BoxType::get(actualTy), rebox.getBox(), + rebox.getShape(), rebox.getSlice()); return fir::substBase(box, newBox); } } mlir::Value empty; mlir::ValueRange emptyRange; - return builder.create(loc, fir::BoxType::get(actualTy), boxBase, - /*shape=*/empty, - /*slice=*/empty); + return fir::ReboxOp::create(builder, loc, fir::BoxType::get(actualTy), + boxBase, + /*shape=*/empty, + /*slice=*/empty); } fir::ExtendedValue Fortran::lower::createBoxValue( @@ -7648,9 +7664,9 @@ fir::ArrayLoadOp genArrayLoad(mlir::Location loc, mlir::Value addr = fir::getBase(exv); mlir::Value shapeOp = builder.createShape(loc, exv); mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(addr.getType()); - return builder.create(loc, arrTy, addr, shapeOp, - /*slice=*/mlir::Value{}, - fir::getTypeParams(exv)); + return fir::ArrayLoadOp::create(builder, loc, arrTy, addr, shapeOp, + /*slice=*/mlir::Value{}, + fir::getTypeParams(exv)); } template <> fir::ArrayLoadOp @@ -7697,9 +7713,9 @@ void Fortran::lower::createArrayMergeStores( for (auto i : llvm::enumerate(esp.getOuterLoop().getResults())) if (std::optional ldOpt = esp.getLhsLoad(i.index())) { fir::ArrayLoadOp load = *ldOpt; - builder.create(loc, load, i.value(), - load.getMemref(), load.getSlice(), - load.getTypeparams()); + fir::ArrayMergeStoreOp::create(builder, loc, load, i.value(), + load.getMemref(), load.getSlice(), + load.getTypeparams()); } if (esp.loopCleanup) { (*esp.loopCleanup)(builder); @@ -7721,12 +7737,12 @@ mlir::Value Fortran::lower::addCrayPointerInst(mlir::Location loc, mlir::Value empty; mlir::ValueRange emptyRange; auto boxTy = fir::BoxType::get(ptrTy); - auto box = builder.create(loc, boxTy, ptrVal, empty, empty, - emptyRange); 
- mlir::Value addrof = - (mlir::isa(ptrTy)) - ? builder.create(loc, ptrTy, box) - : builder.create(loc, builder.getRefType(ptrTy), box); + auto box = fir::EmboxOp::create(builder, loc, boxTy, ptrVal, empty, empty, + emptyRange); + mlir::Value addrof = (mlir::isa(ptrTy)) + ? fir::BoxAddrOp::create(builder, loc, ptrTy, box) + : fir::BoxAddrOp::create( + builder, loc, builder.getRefType(ptrTy), box); auto refPtrTy = builder.getRefType(fir::PointerType::get(fir::dyn_cast_ptrEleTy(pteTy))); diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp index 9689f920840fb..46be111242bf7 100644 --- a/flang/lib/Lower/ConvertExprToHLFIR.cpp +++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp @@ -167,7 +167,7 @@ class HlfirDesignatorBuilder { extents.push_back(builder.createIntegerConstant(loc, idxTy, extent)); } if (!mayHaveNonDefaultLowerBounds(componentSym)) - return builder.create(loc, extents); + return fir::ShapeOp::create(builder, loc, extents); llvm::SmallVector lbounds; if (const auto *objDetails = @@ -312,8 +312,8 @@ class HlfirDesignatorBuilder { // hlfir.elemental_addr. if (auto elementalAddrOp = getVectorSubscriptElementAddrOp()) builder.setInsertionPointToEnd(&elementalAddrOp->getBody().front()); - auto designate = builder.create( - getLoc(), designatorType, partInfo.base.value().getBase(), + auto designate = hlfir::DesignateOp::create( + builder, getLoc(), designatorType, partInfo.base.value().getBase(), partInfo.componentName, partInfo.componentShape, partInfo.subscripts, partInfo.substring, partInfo.complexPart, partInfo.resultShape, partInfo.typeParams, attributes); @@ -344,7 +344,7 @@ class HlfirDesignatorBuilder { mlir::Type refPtrType = builder.getRefType( fir::PointerType::get(fir::dyn_cast_ptrEleTy(ptrAddr.getType()))); mlir::Value cast = builder.createConvert(loc, refPtrType, ptrAddr); - mlir::Value ptrVal = builder.create(loc, cast); + mlir::Value ptrVal = fir::LoadOp::create(builder, loc, cast); // Update the base_addr to the value of the Cray pointer. // This is a hacky way to do the update, and it may harm @@ -442,9 +442,9 @@ class HlfirDesignatorBuilder { } else { // Compute "len = max(ub-lb+1,0)" (Fortran 2018 9.4.1). 
mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - auto boundsDiff = builder.create( - loc, partInfo.substring[1], partInfo.substring[0]); - auto rawLen = builder.create(loc, boundsDiff, one); + auto boundsDiff = mlir::arith::SubIOp::create( + builder, loc, partInfo.substring[1], partInfo.substring[0]); + auto rawLen = mlir::arith::AddIOp::create(builder, loc, boundsDiff, one); partInfo.typeParams[0] = fir::factory::genMaxWithZero(builder, loc, rawLen); } @@ -803,10 +803,10 @@ class HlfirDesignatorBuilder { mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); oneBasedIndex = builder.createConvert(loc, idxTy, oneBasedIndex); mlir::Value zeroBased = - builder.create(loc, oneBasedIndex, one); + mlir::arith::SubIOp::create(builder, loc, oneBasedIndex, one); mlir::Value offset = - builder.create(loc, zeroBased, step); - return builder.create(loc, lb, offset); + mlir::arith::MulIOp::create(builder, loc, zeroBased, step); + return mlir::arith::AddIOp::create(builder, loc, lb, offset); } /// Create an hlfir.element_addr operation to deal with vector subscripted @@ -836,8 +836,8 @@ class HlfirDesignatorBuilder { assert(partInfo.base.has_value() && "vector subscripted part must have a base"); mlir::Value mold = *partInfo.base; - auto elementalAddrOp = builder.create( - loc, shape, mold, mlir::ValueRange{}, + auto elementalAddrOp = hlfir::ElementalAddrOp::create( + builder, loc, shape, mold, mlir::ValueRange{}, /*isUnordered=*/true); setVectorSubscriptElementAddrOp(elementalAddrOp); builder.setInsertionPointToEnd(&elementalAddrOp.getBody().front()); @@ -881,7 +881,7 @@ class HlfirDesignatorBuilder { builder.setInsertionPointToEnd(&elementalAddrOp.getBody().front()); if (!elementAddr.isPolymorphic()) elementalAddrOp.getMoldMutable().clear(); - builder.create(loc, elementAddr); + hlfir::YieldOp::create(builder, loc, elementAddr); builder.setInsertionPointAfter(elementalAddrOp); } @@ -1001,7 +1001,7 @@ HlfirDesignatorBuilder::convertVectorSubscriptedExprToElementalAddr( elementalAddrOp.getMoldMutable().clear(); // Create the hlfir.yield terminator inside the hlfir.elemental_body. builder.setInsertionPointToEnd(&elementalAddrOp.getBody().front()); - builder.create(loc, elementAddrEntity); + hlfir::YieldOp::create(builder, loc, elementAddrEntity); builder.setInsertionPointAfter(elementalAddrOp); // Reset the HlfirDesignatorBuilder state, in case it is used on a new // designator. 
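The subscript arithmetic above maps a one-based position within a triplet section back to an index into the base array: element i of the section lives at lb + (i - 1) * step. A plain C++ model of the same computation (hypothetical helper):

#include <cstdint>
std::int64_t tripletIndex(std::int64_t lb, std::int64_t step,
                          std::int64_t oneBasedIndex) {
  // cf. zeroBased = i - 1; offset = zeroBased * step; result = lb + offset
  return lb + (oneBasedIndex - 1) * step;
}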
@@ -1034,7 +1034,7 @@ struct BinaryOp {}; rhs)}; \ } else { \ return hlfir::EntityWithAttributes{ \ - builder.create(loc, lhs, rhs)}; \ + GenBinFirOp::create(builder, loc, lhs, rhs)}; \ } \ } \ }; @@ -1075,7 +1075,7 @@ struct BinaryOp(loc, lhs, rhs)}; + mlir::complex::DivOp::create(builder, loc, lhs, rhs)}; } } }; @@ -1219,8 +1219,8 @@ struct BinaryOp( - loc, translateSignedRelational(op.opr), lhs, rhs); + auto cmp = mlir::arith::CmpIOp::create( + builder, loc, translateSignedRelational(op.opr), lhs, rhs); return hlfir::EntityWithAttributes{cmp}; } }; @@ -1241,8 +1241,8 @@ struct BinaryOp( - loc, translateUnsignedRelational(op.opr), lhsSL, rhsSL); + auto cmp = mlir::arith::CmpIOp::create( + builder, loc, translateUnsignedRelational(op.opr), lhsSL, rhsSL); return hlfir::EntityWithAttributes{cmp}; } }; @@ -1256,8 +1256,8 @@ struct BinaryOp( - loc, translateFloatRelational(op.opr), lhs, rhs); + auto cmp = mlir::arith::CmpFOp::create( + builder, loc, translateFloatRelational(op.opr), lhs, rhs); return hlfir::EntityWithAttributes{cmp}; } }; @@ -1271,8 +1271,8 @@ struct BinaryOp( - loc, translateFloatRelational(op.opr), lhs, rhs); + auto cmp = fir::CmpcOp::create(builder, loc, + translateFloatRelational(op.opr), lhs, rhs); return hlfir::EntityWithAttributes{cmp}; } }; @@ -1313,16 +1313,16 @@ struct BinaryOp> { switch (op.logicalOperator) { case Fortran::evaluate::LogicalOperator::And: return hlfir::EntityWithAttributes{ - builder.create(loc, i1Lhs, i1Rhs)}; + mlir::arith::AndIOp::create(builder, loc, i1Lhs, i1Rhs)}; case Fortran::evaluate::LogicalOperator::Or: return hlfir::EntityWithAttributes{ - builder.create(loc, i1Lhs, i1Rhs)}; + mlir::arith::OrIOp::create(builder, loc, i1Lhs, i1Rhs)}; case Fortran::evaluate::LogicalOperator::Eqv: - return hlfir::EntityWithAttributes{builder.create( - loc, mlir::arith::CmpIPredicate::eq, i1Lhs, i1Rhs)}; + return hlfir::EntityWithAttributes{mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, i1Lhs, i1Rhs)}; case Fortran::evaluate::LogicalOperator::Neqv: - return hlfir::EntityWithAttributes{builder.create( - loc, mlir::arith::CmpIPredicate::ne, i1Lhs, i1Rhs)}; + return hlfir::EntityWithAttributes{mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, i1Lhs, i1Rhs)}; case Fortran::evaluate::LogicalOperator::Not: // lib/evaluate expression for .NOT. is Fortran::evaluate::Not. llvm_unreachable(".NOT. is not a binary operator"); @@ -1354,7 +1354,7 @@ struct BinaryOp> { // Fortran 2018 7.4.4.2 point 5. 
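Both logical operands are first converted to i1, after which the Fortran operators map onto single integer ops as in the switch above: .AND. becomes arith.andi, .OR. becomes arith.ori, .EQV. becomes arith.cmpi eq, and .NEQV. becomes arith.cmpi ne (.NOT. is handled as a unary op and lowered to an xor with true). A small C++ model of the same truth functions on bool (hypothetical, purely illustrative):

bool andOp(bool a, bool b) { return a & b; }   // .AND.  -> arith.andi
bool orOp(bool a, bool b)  { return a | b; }   // .OR.   -> arith.ori
bool eqv(bool a, bool b)   { return a == b; }  // .EQV.  -> arith.cmpi eq
bool neqv(bool a, bool b)  { return a != b; }  // .NEQV. -> arith.cmpi ne
bool logicalNot(bool a)    { return a ^ true; } // .NOT. -> xor with true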
mlir::Value safeLength = fir::factory::genMaxWithZero(builder, loc, length); return hlfir::EntityWithAttributes{ - builder.create(loc, string, safeLength)}; + hlfir::SetLengthOp::create(builder, loc, string, safeLength)}; } static void genResultTypeParams(mlir::Location, fir::FirOpBuilder &, hlfir::Entity, @@ -1372,7 +1372,7 @@ struct BinaryOp> { hlfir::Entity lhs, hlfir::Entity rhs) { assert(len && "genResultTypeParams must have been called"); auto concat = - builder.create(loc, mlir::ValueRange{lhs, rhs}, len); + hlfir::ConcatOp::create(builder, loc, mlir::ValueRange{lhs, rhs}, len); return hlfir::EntityWithAttributes{concat.getResult()}; } void @@ -1386,7 +1386,7 @@ struct BinaryOp> { mlir::Type idxType = builder.getIndexType(); mlir::Value lhsLen = builder.createConvert(loc, idxType, lengths[0]); mlir::Value rhsLen = builder.createConvert(loc, idxType, lengths[1]); - len = builder.create(loc, lhsLen, rhsLen); + len = mlir::arith::AddIOp::create(builder, loc, lhsLen, rhsLen); resultTypeParams.push_back(len); } @@ -1410,7 +1410,7 @@ struct UnaryOp> { mlir::Value one = builder.createBool(loc, true); mlir::Value val = builder.createConvert(loc, builder.getI1Type(), lhs); return hlfir::EntityWithAttributes{ - builder.create(loc, val, one)}; + mlir::arith::XOrIOp::create(builder, loc, val, one)}; } }; @@ -1428,7 +1428,7 @@ struct UnaryOp(loc, zero, lhs)}; + mlir::arith::SubIOp::create(builder, loc, zero, lhs)}; } }; @@ -1448,7 +1448,7 @@ struct UnaryOp(loc, zero, signless); + mlir::arith::SubIOp::create(builder, loc, zero, signless); return hlfir::EntityWithAttributes( builder.createConvert(loc, lhs.getType(), negated)); } @@ -1463,7 +1463,7 @@ struct UnaryOp(loc, lhs)}; + mlir::arith::NegFOp::create(builder, loc, lhs)}; } }; @@ -1475,7 +1475,7 @@ struct UnaryOp(loc, lhs)}; + return hlfir::EntityWithAttributes{fir::NegcOp::create(builder, loc, lhs)}; } }; @@ -1499,9 +1499,9 @@ struct UnaryOp> { const Op &op, hlfir::Entity lhs) { if (lhs.isVariable()) return hlfir::EntityWithAttributes{ - builder.create(loc, lhs)}; + hlfir::AsExprOp::create(builder, loc, lhs)}; return hlfir::EntityWithAttributes{ - builder.create(loc, lhs.getType(), lhs)}; + hlfir::NoReassocOp::create(builder, loc, lhs.getType(), lhs)}; } static void @@ -1822,8 +1822,8 @@ class HlfirBuilder { // Allocate scalar temporary that will be initialized // with the values specified by the constructor. mlir::Value storagePtr = builder.createTemporary(loc, recTy); - auto varOp = hlfir::EntityWithAttributes{builder.create( - loc, storagePtr, "ctor.temp", /*shape=*/nullptr, + auto varOp = hlfir::EntityWithAttributes{hlfir::DeclareOp::create( + builder, loc, storagePtr, "ctor.temp", /*shape=*/nullptr, /*typeparams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{})}; @@ -1855,8 +1855,8 @@ class HlfirBuilder { auto parentCompType = baseRecTy.getType(parentName); assert(parentCompType && "failed to retrieve parent component type"); mlir::Type designatorType = builder.getRefType(parentCompType); - mlir::Value newParent = builder.create( - loc, designatorType, currentParent, parentName, + mlir::Value newParent = hlfir::DesignateOp::create( + builder, loc, designatorType, currentParent, parentName, /*compShape=*/mlir::Value{}, hlfir::DesignateOp::Subscripts{}, /*substring=*/mlir::ValueRange{}, /*complexPart=*/std::nullopt, @@ -1912,8 +1912,8 @@ class HlfirBuilder { extraAttributeFlags); // Get the component designator. 
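Negation above is emitted as 0 - x; for UNSIGNED operands the value is first converted to a signless integer type so the subtraction simply wraps modulo 2^n, and the result is converted back to the original type. A plain C++ illustration of the wrapping behaviour for a 32-bit value (hypothetical, not the generated code):

#include <cstdint>
std::uint32_t negateUnsigned(std::uint32_t x) {
  // wraps modulo 2^32, e.g. negateUnsigned(1) == 4294967295
  return std::uint32_t{0} - x;
}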
- auto lhs = builder.create( - loc, designatorType, baseOp, name, compShape, + auto lhs = hlfir::DesignateOp::create( + builder, loc, designatorType, baseOp, name, compShape, hlfir::DesignateOp::Subscripts{}, /*substring=*/mlir::ValueRange{}, /*complexPart=*/std::nullopt, @@ -1945,7 +1945,7 @@ class HlfirBuilder { fir::emitFatalError(loc, "pointer component designator could not be " "lowered to mutable box"); Fortran::lower::associateMutableBox(converter, loc, *toBox, expr, - /*lbounds=*/std::nullopt, stmtCtx); + /*lbounds=*/{}, stmtCtx); continue; } @@ -1997,10 +1997,10 @@ class HlfirBuilder { auto rhsCastAndCleanup = hlfir::genTypeAndKindConvert(loc, builder, rhs, lhs.getType(), /*preserveLowerBounds=*/allowRealloc); - builder.create(loc, rhsCastAndCleanup.first, lhs, - allowRealloc, - allowRealloc ? keepLhsLength : false, - /*temporary_lhs=*/true); + hlfir::AssignOp::create(builder, loc, rhsCastAndCleanup.first, lhs, + allowRealloc, + allowRealloc ? keepLhsLength : false, + /*temporary_lhs=*/true); if (rhsCastAndCleanup.second) (*rhsCastAndCleanup.second)(); }; diff --git a/flang/lib/Lower/ConvertProcedureDesignator.cpp b/flang/lib/Lower/ConvertProcedureDesignator.cpp index b528544ec245c..d4c535d71cb5f 100644 --- a/flang/lib/Lower/ConvertProcedureDesignator.cpp +++ b/flang/lib/Lower/ConvertProcedureDesignator.cpp @@ -49,7 +49,7 @@ fir::ExtendedValue Fortran::lower::convertProcedureDesignator( fir::getUnrestrictedIntrinsicSymbolRefAttr(builder, loc, genericName, signature); mlir::Value funcPtr = - builder.create(loc, signature, symbolRefAttr); + fir::AddrOfOp::create(builder, loc, signature, symbolRefAttr); return funcPtr; } const Fortran::semantics::Symbol *symbol = proc.GetSymbol(); @@ -69,7 +69,7 @@ fir::ExtendedValue Fortran::lower::convertProcedureDesignator( Fortran::lower::getOrDeclareFunction(proc, converter); mlir::SymbolRefAttr nameAttr = builder.getSymbolRefAttr(func.getSymName()); funcPtr = - builder.create(loc, func.getFunctionType(), nameAttr); + fir::AddrOfOp::create(builder, loc, func.getFunctionType(), nameAttr); } if (Fortran::lower::mustPassLengthWithDummyProcedure(proc, converter)) { // The result length, if available here, must be propagated along the @@ -114,7 +114,7 @@ static hlfir::EntityWithAttributes designateProcedurePointerComponent( /// Passed argument may be a descriptor. This is a scalar reference, so the /// base address can be directly addressed. 
if (mlir::isa(base.getType())) - base = builder.create(loc, base); + base = fir::BoxAddrOp::create(builder, loc, base); std::string fieldName = converter.getRecordTypeFieldName(procComponentSym); auto recordType = mlir::cast(hlfir::getFortranElementType(base.getType())); @@ -124,8 +124,8 @@ static hlfir::EntityWithAttributes designateProcedurePointerComponent( if (!fieldType) TODO(loc, "passing type bound procedure (extension)"); mlir::Type designatorType = fir::ReferenceType::get(fieldType); - mlir::Value compRef = builder.create( - loc, designatorType, base, fieldName, + mlir::Value compRef = hlfir::DesignateOp::create( + builder, loc, designatorType, base, fieldName, /*compShape=*/mlir::Value{}, hlfir::DesignateOp::Subscripts{}, /*substring=*/mlir::ValueRange{}, /*complexPart=*/std::nullopt, @@ -174,10 +174,10 @@ hlfir::EntityWithAttributes Fortran::lower::convertProcedureDesignatorToHLFIR( mlir::Type boxTy = Fortran::lower::getUntypedBoxProcType(&converter.getMLIRContext()); if (auto host = Fortran::lower::argumentHostAssocs(converter, funcAddr)) - funcAddr = builder.create( - loc, boxTy, llvm::ArrayRef{funcAddr, host}); + funcAddr = fir::EmboxProcOp::create( + builder, loc, boxTy, llvm::ArrayRef{funcAddr, host}); else - funcAddr = builder.create(loc, boxTy, funcAddr); + funcAddr = fir::EmboxProcOp::create(builder, loc, boxTy, funcAddr); } mlir::Value res = procExv.match( diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 2bfa9618aa4b9..647bd0d079985 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -236,9 +236,8 @@ mlir::Value Fortran::lower::genInitialDataTarget( fir::FirOpBuilder &builder = converter.getFirOpBuilder(); if (Fortran::evaluate::UnwrapExpr( initialTarget)) - return fir::factory::createUnallocatedBox( - builder, loc, boxType, - /*nonDeferredParams=*/std::nullopt); + return fir::factory::createUnallocatedBox(builder, loc, boxType, + /*nonDeferredParams=*/{}); // Pointer initial data target, and NULL(mold). for (const auto &sym : Fortran::evaluate::CollectSymbols(initialTarget)) { // Derived type component symbols should not be instantiated as objects @@ -313,8 +312,8 @@ mlir::Value Fortran::lower::genInitialDataTarget( // initial value of the descriptor). // Create a fir.rebox to set the attribute correctly, and use targetShift // to preserve the target lower bounds if any. - return builder.create(loc, boxType, targetBox, targetShift, - /*slice=*/mlir::Value{}); + return fir::ReboxOp::create(builder, loc, boxType, targetBox, targetShift, + /*slice=*/mlir::Value{}); } /// Generate default initial value for a derived type object \p sym with mlir @@ -354,8 +353,8 @@ static mlir::Value genComponentDefaultInit( // From a standard point of view, pointer without initialization do not // need to be disassociated, but for sanity and simplicity, do it in // global constructor since this has no runtime cost. - componentValue = fir::factory::createUnallocatedBox( - builder, loc, componentTy, std::nullopt); + componentValue = + fir::factory::createUnallocatedBox(builder, loc, componentTy, {}); } else if (Fortran::lower::hasDefaultInitialization(component)) { // Component type has default initialization. componentValue = genDefaultInitializerValue(converter, loc, component, @@ -363,7 +362,7 @@ static mlir::Value genComponentDefaultInit( } else { // Component has no initial value. Set its bits to zero by extension // to match what is expected because other compilers are doing it. 
- componentValue = builder.create(loc, componentTy); + componentValue = fir::ZeroOp::create(builder, loc, componentTy); } } else if (const auto *proc{ component @@ -378,17 +377,17 @@ static mlir::Value genComponentDefaultInit( componentValue = fir::factory::createNullBoxProc(builder, loc, componentTy); } else - componentValue = builder.create(loc, componentTy); + componentValue = fir::ZeroOp::create(builder, loc, componentTy); } assert(componentValue && "must have been computed"); componentValue = builder.createConvert(loc, componentTy, componentValue); auto fieldTy = fir::FieldType::get(recTy.getContext()); // FIXME: type parameters must come from the derived-type-spec - auto field = builder.create( - loc, fieldTy, name, recTy, - /*typeParams=*/mlir::ValueRange{} /*TODO*/); - return builder.create( - loc, recTy, insertInto, componentValue, + auto field = + fir::FieldIndexOp::create(builder, loc, fieldTy, name, recTy, + /*typeParams=*/mlir::ValueRange{} /*TODO*/); + return fir::InsertValueOp::create( + builder, loc, recTy, insertInto, componentValue, builder.getArrayAttr(field.getAttributes())); } @@ -406,7 +405,7 @@ static mlir::Value genDefaultInitializerValue( // Build a scalar default value of the symbol type, looping through the // components to build each component initial value. auto recTy = mlir::cast(scalarType); - mlir::Value initialValue = builder.create(loc, scalarType); + mlir::Value initialValue = fir::UndefOp::create(builder, loc, scalarType); const Fortran::semantics::DeclTypeSpec *declTy = sym.GetType(); assert(declTy && "var with default initialization must have a type"); @@ -445,7 +444,7 @@ static mlir::Value genDefaultInitializerValue( if (sequenceType) { // For arrays, duplicate the scalar value to all elements with an // fir.insert_range covering the whole array. 
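The default initializer above is built as a pure SSA value: it starts from fir.undefined of the record type, and each component's value is folded in with fir.field_index plus fir.insert_value, every step producing a new aggregate. A hypothetical value-semantics analogue in C++, with Rec and its helpers invented for illustration:

struct Rec { int count; double scale; };
// Each call returns a new aggregate with one more component filled in,
// mirroring the insert_value chain (inputs are never mutated in place).
Rec withCount(Rec r, int v)    { r.count = v; return r; }
Rec withScale(Rec r, double v) { r.scale = v; return r; }
// Rec init = withScale(withCount(Rec{}, 0), 1.0);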
- auto arrayInitialValue = builder.create(loc, sequenceType); + auto arrayInitialValue = fir::UndefOp::create(builder, loc, sequenceType); llvm::SmallVector rangeBounds; for (int64_t extent : sequenceType.getShape()) { if (extent == fir::SequenceType::getUnknownExtent()) @@ -454,8 +453,8 @@ static mlir::Value genDefaultInitializerValue( rangeBounds.push_back(0); rangeBounds.push_back(extent - 1); } - return builder.create( - loc, sequenceType, arrayInitialValue, initialValue, + return fir::InsertOnRangeOp::create( + builder, loc, sequenceType, arrayInitialValue, initialValue, builder.getIndexVectorAttr(rangeBounds)); } return initialValue; @@ -547,16 +546,16 @@ fir::GlobalOp Fortran::lower::defineGlobal( createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) { mlir::Value box = Fortran::lower::genInitialDataTarget(converter, loc, symTy, expr); - b.create(loc, box); + fir::HasValueOp::create(b, loc, box); }); } else { // Create unallocated/disassociated descriptor if no explicit init createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) { mlir::Value box = fir::factory::createUnallocatedBox( b, loc, symTy, - /*nonDeferredParams=*/std::nullopt, + /*nonDeferredParams=*/{}, /*typeSourceBox=*/{}, getAllocatorIdxFromDataAttr(dataAttr)); - b.create(loc, box); + fir::HasValueOp::create(b, loc, box); }); } } else if (const auto *details = @@ -570,7 +569,7 @@ fir::GlobalOp Fortran::lower::defineGlobal( converter, loc, details->init().value(), stmtCtx); mlir::Value castTo = builder.createConvert(loc, symTy, fir::getBase(initVal)); - builder.create(loc, castTo); + fir::HasValueOp::create(builder, loc, castTo); }); } else if (Fortran::lower::hasDefaultInitialization(sym)) { createGlobalInitialization( @@ -580,7 +579,7 @@ fir::GlobalOp Fortran::lower::defineGlobal( mlir::Value initVal = genDefaultInitializerValue(converter, loc, sym, symTy, stmtCtx); mlir::Value castTo = builder.createConvert(loc, symTy, initVal); - builder.create(loc, castTo); + fir::HasValueOp::create(builder, loc, castTo); }); } } else if (Fortran::semantics::IsProcedurePointer(sym)) { @@ -595,19 +594,19 @@ fir::GlobalOp Fortran::lower::defineGlobal( auto box{Fortran::lower::convertProcedureDesignatorInitialTarget( converter, loc, *sym)}; auto castTo{builder.createConvert(loc, symTy, box)}; - b.create(loc, castTo); + fir::HasValueOp::create(b, loc, castTo); }); else { // Has NULL() target. createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) { auto box{fir::factory::createNullBoxProc(b, loc, symTy)}; - b.create(loc, box); + fir::HasValueOp::create(b, loc, box); }); } } else { // No initialization. 
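For an array, the scalar default value is replicated with a single fir.insert_on_range whose bounds vector holds a zero-based [0, extent - 1] pair per dimension, as built by the loop above. A small sketch of how that vector is assembled (hypothetical helper mirroring the loop):

#include <cstdint>
#include <vector>
std::vector<std::int64_t>
rangeBoundsFor(const std::vector<std::int64_t> &shape) {
  std::vector<std::int64_t> bounds;
  for (std::int64_t extent : shape) {
    bounds.push_back(0);            // lower bound of the covered range
    bounds.push_back(extent - 1);   // upper bound (inclusive)
  }
  return bounds;
}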
createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) { auto box{fir::factory::createNullBoxProc(b, loc, symTy)}; - b.create(loc, box); + fir::HasValueOp::create(b, loc, box); }); } } else if (sym.has()) { @@ -633,10 +632,10 @@ fir::GlobalOp Fortran::lower::defineGlobal( builder, global, [&](fir::FirOpBuilder &builder) { mlir::Value initValue; if (converter.getLoweringOptions().getInitGlobalZero()) - initValue = builder.create(loc, symTy); + initValue = fir::ZeroOp::create(builder, loc, symTy); else - initValue = builder.create(loc, symTy); - builder.create(loc, initValue); + initValue = fir::UndefOp::create(builder, loc, symTy); + fir::HasValueOp::create(builder, loc, initValue); }); } // Set public visibility to prevent global definition to be optimized out @@ -691,8 +690,8 @@ static void instantiateGlobal(Fortran::lower::AbstractConverter &converter, sym); global = defineGlobal(converter, var, globalName, linkage, dataAttr); } - auto addrOf = builder.create(loc, global.resultType(), - global.getSymbol()); + auto addrOf = fir::AddrOfOp::create(builder, loc, global.resultType(), + global.getSymbol()); // The type of the global cannot be trusted to be the same as the one // of the variable as some existing programs map common blocks to // BIND(C) module variables (e.g. mpi_argv_null in MPI and MPI_F08). @@ -753,7 +752,7 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter, // would be a waste of space, and incorrect if the pointee is a non dummy // assumed-size (possible with cray pointee). if (ultimateSymbol.test(Fortran::semantics::Symbol::Flag::CrayPointee)) - return builder.create(loc, fir::ReferenceType::get(ty)); + return fir::ZeroOp::create(builder, loc, fir::ReferenceType::get(ty)); if (needCUDAAlloc(ultimateSymbol)) { cuf::DataAttributeAttr dataAttr = @@ -768,12 +767,12 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter, for (mlir::Value sh : elidedShape) indices.push_back(builder.createConvert(loc, idxTy, sh)); if (dataAttr.getValue() == cuf::DataAttribute::Shared) - return builder.create(loc, ty, nm, symNm, lenParams, - indices); + return cuf::SharedMemoryOp::create(builder, loc, ty, nm, symNm, lenParams, + indices); if (!cuf::isCUDADeviceContext(builder.getRegion())) - return builder.create(loc, ty, nm, symNm, dataAttr, - lenParams, indices); + return cuf::AllocOp::create(builder, loc, ty, nm, symNm, dataAttr, + lenParams, indices); } // Let the builder do all the heavy lifting. @@ -783,7 +782,7 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter, // Local procedure pointer. auto res{builder.allocateLocal(loc, ty, nm, symNm, shape, lenParams, isTarg)}; auto box{fir::factory::createNullBoxProc(builder, loc, ty)}; - builder.create(loc, box, res); + fir::StoreOp::create(builder, loc, box, res); return res; } @@ -802,13 +801,24 @@ initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::DerivedTypeSpec *derived{type ? type->AsDerived() : nullptr}; if (derived) { + if (!FindCUDADeviceAllocatableUltimateComponent(*derived)) + return; // No device components. 
+ fir::FirOpBuilder &builder = converter.getFirOpBuilder(); mlir::Location loc = converter.getCurrentLocation(); fir::ExtendedValue exv = converter.getSymbolExtendedValue(symbol.GetUltimate(), &symMap); - auto recTy = mlir::dyn_cast( - fir::unwrapRefType(fir::getBase(exv).getType())); + mlir::Type baseTy = fir::unwrapRefType(fir::getBase(exv).getType()); + if (auto boxTy = mlir::dyn_cast(baseTy)) + baseTy = boxTy.getEleTy(); + baseTy = fir::unwrapRefType(baseTy); + + if (mlir::isa(baseTy)) + TODO(loc, "array of derived-type with device component"); + + auto recTy = + mlir::dyn_cast(fir::unwrapSequenceType(baseTy)); assert(recTy && "expected fir::RecordType"); llvm::SmallVector coordinates; @@ -817,15 +827,15 @@ initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter, if (Fortran::semantics::IsDeviceAllocatable(sym)) { unsigned fieldIdx = recTy.getFieldIndex(sym.name().ToString()); mlir::Type fieldTy; - std::vector coordinates; + llvm::SmallVector coordinates; if (fieldIdx != std::numeric_limits::max()) { // Field found in the base record type. auto fieldName = recTy.getTypeList()[fieldIdx].first; fieldTy = recTy.getTypeList()[fieldIdx].second; - mlir::Value fieldIndex = builder.create( - loc, fir::FieldType::get(fieldTy.getContext()), fieldName, - recTy, + mlir::Value fieldIndex = fir::FieldIndexOp::create( + builder, loc, fir::FieldType::get(fieldTy.getContext()), + fieldName, recTy, /*typeParams=*/mlir::ValueRange{}); coordinates.push_back(fieldIndex); } else { @@ -836,19 +846,18 @@ initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter, mlir::dyn_cast(component.second)) { fieldIdx = childRecTy.getFieldIndex(sym.name().ToString()); if (fieldIdx != std::numeric_limits::max()) { - mlir::Value parentFieldIndex = - builder.create( - loc, fir::FieldType::get(childRecTy.getContext()), - component.first, recTy, - /*typeParams=*/mlir::ValueRange{}); + mlir::Value parentFieldIndex = fir::FieldIndexOp::create( + builder, loc, + fir::FieldType::get(childRecTy.getContext()), + component.first, recTy, + /*typeParams=*/mlir::ValueRange{}); coordinates.push_back(parentFieldIndex); auto fieldName = childRecTy.getTypeList()[fieldIdx].first; fieldTy = childRecTy.getTypeList()[fieldIdx].second; - mlir::Value childFieldIndex = - builder.create( - loc, fir::FieldType::get(fieldTy.getContext()), - fieldName, childRecTy, - /*typeParams=*/mlir::ValueRange{}); + mlir::Value childFieldIndex = fir::FieldIndexOp::create( + builder, loc, fir::FieldType::get(fieldTy.getContext()), + fieldName, childRecTy, + /*typeParams=*/mlir::ValueRange{}); coordinates.push_back(childFieldIndex); break; } @@ -860,12 +869,28 @@ initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter, TODO(loc, "device resident component in complex derived-type " "hierarchy"); - mlir::Value comp = builder.create( - loc, builder.getRefType(fieldTy), fir::getBase(exv), coordinates); + mlir::Value base = fir::getBase(exv); + mlir::Value comp; + if (mlir::isa(fir::unwrapRefType(base.getType()))) { + mlir::Value box = fir::LoadOp::create(builder, loc, base); + mlir::Value addr = fir::BoxAddrOp::create(builder, loc, box); + llvm::SmallVector lenParams; + assert(coordinates.size() == 1 && "expect one coordinate"); + auto field = mlir::dyn_cast( + coordinates[0].getDefiningOp()); + comp = hlfir::DesignateOp::create( + builder, loc, builder.getRefType(fieldTy), addr, + /*component=*/field.getFieldName(), + /*componentShape=*/mlir::Value{}, + hlfir::DesignateOp::Subscripts{}); + } else { 
+ comp = fir::CoordinateOp::create( + builder, loc, builder.getRefType(fieldTy), base, coordinates); + } cuf::DataAttributeAttr dataAttr = Fortran::lower::translateSymbolCUFDataAttribute( builder.getContext(), sym); - builder.create(loc, comp, dataAttr); + cuf::SetAllocatorIndexOp::create(builder, loc, comp, dataAttr); } } } @@ -907,8 +932,8 @@ void Fortran::lower::defaultInitializeAtRuntime( // 15.5.2.12 point 3, absent optional dummies are not initialized. // Creating descriptor/passing null descriptor to the runtime would // create runtime crashes. - auto isPresent = builder.create(loc, builder.getI1Type(), - fir::getBase(exv)); + auto isPresent = fir::IsPresentOp::create(builder, loc, builder.getI1Type(), + fir::getBase(exv)); builder.genIfThen(loc, isPresent) .genThen([&]() { auto box = builder.createBox(loc, exv); @@ -950,7 +975,7 @@ void Fortran::lower::defaultInitializeAtRuntime( converter, loc, details->init().value(), stmtCtx); mlir::Value castTo = builder.createConvert(loc, symTy, fir::getBase(initVal)); - builder.create(loc, castTo); + fir::HasValueOp::create(builder, loc, castTo); }); } else if (!global) { global = builder.createGlobal(loc, symTy, globalName, linkage, @@ -965,13 +990,13 @@ void Fortran::lower::defaultInitializeAtRuntime( mlir::Value initVal = genDefaultInitializerValue( converter, loc, sym, symTy, stmtCtx); mlir::Value castTo = builder.createConvert(loc, symTy, initVal); - builder.create(loc, castTo); + fir::HasValueOp::create(builder, loc, castTo); }); } - auto addrOf = builder.create(loc, global.resultType(), - global.getSymbol()); - builder.create(loc, addrOf, fir::getBase(exv), - /*noOverlap=*/true); + auto addrOf = fir::AddrOfOp::create(builder, loc, global.resultType(), + global.getSymbol()); + fir::CopyOp::create(builder, loc, addrOf, fir::getBase(exv), + /*noOverlap=*/true); } else { mlir::Value box = builder.createBox(loc, exv); fir::runtime::genDerivedTypeInitialize(builder, loc, box); @@ -1071,8 +1096,8 @@ static void finalizeAtRuntime(Fortran::lower::AbstractConverter &converter, fir::ExtendedValue exv = converter.getSymbolExtendedValue(sym, &symMap); if (Fortran::semantics::IsOptional(sym)) { // Only finalize if present. - auto isPresent = builder.create(loc, builder.getI1Type(), - fir::getBase(exv)); + auto isPresent = fir::IsPresentOp::create(builder, loc, builder.getI1Type(), + fir::getBase(exv)); builder.genIfThen(loc, isPresent) .genThen([&]() { auto box = builder.createBox(loc, exv); @@ -1118,8 +1143,8 @@ static void deallocateIntentOut(Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &builder = converter.getFirOpBuilder(); if (Fortran::semantics::IsOptional(sym)) { - auto isPresent = builder.create( - loc, builder.getI1Type(), fir::getBase(extVal)); + auto isPresent = fir::IsPresentOp::create( + builder, loc, builder.getI1Type(), fir::getBase(extVal)); builder.genIfThen(loc, isPresent) .genThen([&]() { Fortran::lower::genDeallocateIfAllocated(converter, *mutBox, loc); @@ -1332,7 +1357,7 @@ static fir::GlobalOp defineGlobalAggregateStore( Fortran::lower::StatementContext stmtCtx; mlir::Value initVal = fir::getBase(genInitializerExprValue( converter, loc, objectDetails->init().value(), stmtCtx)); - builder.create(loc, initVal); + fir::HasValueOp::create(builder, loc, initVal); }); return global; } @@ -1341,8 +1366,8 @@ static fir::GlobalOp defineGlobalAggregateStore( // of the linkage. 
createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &builder) { Fortran::lower::StatementContext stmtCtx; - mlir::Value initVal = builder.create(loc, aggTy); - builder.create(loc, initVal); + mlir::Value initVal = fir::ZeroOp::create(builder, loc, aggTy); + fir::HasValueOp::create(builder, loc, initVal); }); return global; } @@ -1392,8 +1417,8 @@ instantiateAggregateStore(Fortran::lower::AbstractConverter &converter, global = defineGlobalAggregateStore(converter, aggregate, aggName, linkage); } - auto addr = builder.create(loc, global.resultType(), - global.getSymbol()); + auto addr = fir::AddrOfOp::create(builder, loc, global.resultType(), + global.getSymbol()); auto size = std::get<1>(var.getInterval()); fir::SequenceType::Shape shape(1, size); auto seqTy = fir::SequenceType::get(shape, i8Ty); @@ -1440,8 +1465,8 @@ static void instantiateAlias(Fortran::lower::AbstractConverter &converter, std::size_t off = sym.GetUltimate().offset() - var.getAliasOffset(); mlir::Value storeAddr = getAggregateStore(storeMap, var); mlir::Value offset = builder.createIntegerConstant(loc, idxTy, off); - mlir::Value bytePtr = builder.create( - loc, i8Ptr, storeAddr, mlir::ValueRange{offset}); + mlir::Value bytePtr = fir::CoordinateOp::create( + builder, loc, i8Ptr, storeAddr, mlir::ValueRange{offset}); mlir::Value typedPtr = castAliasToPointer(builder, loc, symType, bytePtr); Fortran::lower::StatementContext stmtCtx; mapSymbolAttributes(converter, var, symMap, stmtCtx, typedPtr); @@ -1630,7 +1655,7 @@ static void finalizeCommonBlockDefinition( mlir::TupleType commonTy = mlir::cast(global.getType()); auto initFunc = [&](fir::FirOpBuilder &builder) { mlir::IndexType idxTy = builder.getIndexType(); - mlir::Value cb = builder.create(loc, commonTy); + mlir::Value cb = fir::ZeroOp::create(builder, loc, commonTy); unsigned tupIdx = 0; std::size_t offset = 0; LLVM_DEBUG(llvm::dbgs() << "block {\n"); @@ -1654,15 +1679,15 @@ static void finalizeCommonBlockDefinition( mlir::IntegerAttr offVal = builder.getIntegerAttr(idxTy, tupIdx); mlir::Value castVal = builder.createConvert( loc, commonTy.getType(tupIdx), fir::getBase(initVal)); - cb = builder.create(loc, commonTy, cb, castVal, - builder.getArrayAttr(offVal)); + cb = fir::InsertValueOp::create(builder, loc, commonTy, cb, castVal, + builder.getArrayAttr(offVal)); ++tupIdx; offset = mem->offset() + mem->size(); } } } LLVM_DEBUG(llvm::dbgs() << "}\n"); - builder.create(loc, cb); + fir::HasValueOp::create(builder, loc, cb); }; createGlobalInitialization(builder, global, initFunc); } @@ -1696,8 +1721,8 @@ mlir::Value Fortran::lower::genCommonBlockMember( mlir::Value offs = builder.createIntegerConstant(loc, builder.getIndexType(), byteOffset); - mlir::Value varAddr = builder.create( - loc, i8Ptr, base, mlir::ValueRange{offs}); + mlir::Value varAddr = fir::CoordinateOp::create(builder, loc, i8Ptr, base, + mlir::ValueRange{offs}); mlir::Type symType = converter.genType(sym); return Fortran::semantics::FindEquivalenceSet(sym) != nullptr @@ -1722,8 +1747,8 @@ static void instantiateCommon(Fortran::lower::AbstractConverter &converter, if (!commonAddr) { // introduce a local AddrOf and add it to the map fir::GlobalOp global = getCommonBlockGlobal(converter, common); - commonAddr = builder.create(loc, global.resultType(), - global.getSymbol()); + commonAddr = fir::AddrOfOp::create(builder, loc, global.resultType(), + global.getSymbol()); symMap.addSymbol(common, commonAddr); } @@ -2005,8 +2030,8 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter 
&converter, Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate())); // Declare a local pointer variable. - auto newBase = builder.create( - loc, boxAlloc, name, /*shape=*/nullptr, lenParams, + auto newBase = hlfir::DeclareOp::create( + builder, loc, boxAlloc, name, /*shape=*/nullptr, lenParams, /*dummy_scope=*/nullptr, attributes); mlir::Value nullAddr = builder.createNullConstant( loc, llvm::cast(ptrBoxType).getEleTy()); @@ -2021,9 +2046,9 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, // Inherit the shape (and maybe length parameters) from the pointee // declaration. mlir::Value initVal = - builder.create(loc, ptrBoxType, nullAddr, shapeOrShift, - /*slice=*/nullptr, lenParams); - builder.create(loc, initVal, newBase.getBase()); + fir::EmboxOp::create(builder, loc, ptrBoxType, nullAddr, shapeOrShift, + /*slice=*/nullptr, lenParams); + fir::StoreOp::create(builder, loc, initVal, newBase.getBase()); // Any reference to the pointee is going to be using the pointer // box from now on. The base_addr of the descriptor must be updated @@ -2037,9 +2062,9 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter, mlir::Value dummyScope; if (converter.isRegisteredDummySymbol(sym)) dummyScope = converter.dummyArgsScopeValue(); - auto newBase = builder.create( - loc, base, name, shapeOrShift, lenParams, dummyScope, attributes, - dataAttr); + auto newBase = + hlfir::DeclareOp::create(builder, loc, base, name, shapeOrShift, + lenParams, dummyScope, attributes, dataAttr); symMap.addVariableDefinition(sym, newBase, force); return; } @@ -2188,7 +2213,7 @@ void Fortran::lower::mapSymbolAttributes( // Additional discussion below. mlir::Type dummyProcType = Fortran::lower::getDummyProcedureType(sym, converter); - mlir::Value undefOp = builder.create(loc, dummyProcType); + mlir::Value undefOp = fir::UndefOp::create(builder, loc, dummyProcType); Fortran::lower::genDeclareSymbol(converter, symMap, sym, undefOp); } @@ -2278,32 +2303,32 @@ void Fortran::lower::mapSymbolAttributes( mlir::Type lenType = builder.getCharacterLengthType(); mlir::Value addr, len; if (Fortran::semantics::IsOptional(sym)) { - auto isPresent = builder.create( - loc, builder.getI1Type(), dummyArg); + auto isPresent = fir::IsPresentOp::create( + builder, loc, builder.getI1Type(), dummyArg); auto addrAndLen = builder .genIfOp(loc, {refTy, lenType}, isPresent, /*withElseRegion=*/true) .genThen([&]() { mlir::Value readAddr = - builder.create(loc, refTy, dummyArg); + fir::BoxAddrOp::create(builder, loc, refTy, dummyArg); mlir::Value readLength = charHelp.readLengthFromBox(dummyArg); - builder.create( - loc, mlir::ValueRange{readAddr, readLength}); + fir::ResultOp::create( + builder, loc, mlir::ValueRange{readAddr, readLength}); }) .genElse([&] { mlir::Value readAddr = builder.genAbsentOp(loc, refTy); mlir::Value readLength = fir::factory::createZeroValue(builder, loc, lenType); - builder.create( - loc, mlir::ValueRange{readAddr, readLength}); + fir::ResultOp::create( + builder, loc, mlir::ValueRange{readAddr, readLength}); }) .getResults(); addr = addrAndLen[0]; len = addrAndLen[1]; } else { - addr = builder.create(loc, refTy, dummyArg); + addr = fir::BoxAddrOp::create(builder, loc, refTy, dummyArg); len = charHelp.readLengthFromBox(dummyArg); } if (!explicitParams.empty()) @@ -2402,7 +2427,7 @@ void Fortran::lower::mapSymbolAttributes( mlir::Value dim = builder.createIntegerConstant(loc, idxTy, iter.index()); auto dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, box, dim); + 
fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, box, dim); shapes.emplace_back(dimInfo.getResult(1)); } else if (spec->ubound().isStar()) { shapes.emplace_back(getAssumedSizeExtent(loc, builder)); @@ -2426,7 +2451,7 @@ void Fortran::lower::mapSymbolAttributes( mlir::Value dim = builder.createIntegerConstant(loc, idxTy, iter.index()); dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, box, dim); + fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, box, dim); extents.emplace_back(dimInfo.getResult(1)); if (auto low = spec->lbound().GetExplicit()) { auto expr = Fortran::lower::SomeExpr{*low}; @@ -2475,7 +2500,7 @@ void Fortran::lower::mapSymbolAttributes( if (auto boxTy = mlir::dyn_cast(arg.getType())) { // Contiguous assumed shape that can be tracked without a fir.box. mlir::Type refTy = builder.getRefType(boxTy.getEleTy()); - addr = builder.create(loc, refTy, arg); + addr = fir::BoxAddrOp::create(builder, loc, refTy, arg); } // Compute/Extract character length. @@ -2486,8 +2511,8 @@ void Fortran::lower::mapSymbolAttributes( std::tie(addr, len) = charHelp.createUnboxChar(arg); } else if (mlir::isa(arg.getType())) { // fir.char<1> passed by value (BIND(C) with VALUE attribute). - addr = builder.create(loc, arg.getType()); - builder.create(loc, arg, addr); + addr = fir::AllocaOp::create(builder, loc, arg.getType()); + fir::StoreOp::create(builder, loc, arg, addr); } else if (!addr) { addr = arg; } @@ -2557,7 +2582,7 @@ void Fortran::lower::mapSymbolAttributes( // Dummy argument passed in register. Place the value in memory at that // point since lowering expect symbols to be mapped to memory addresses. mlir::Type symType = converter.genType(sym); - addr = builder.create(loc, symType); + addr = fir::AllocaOp::create(builder, loc, symType); if (isCptrByVal) { // Place the void* address into the CPTR address component. mlir::Value addrComponent = @@ -2777,8 +2802,8 @@ Fortran::lower::genPackArray(Fortran::lower::AbstractConverter &converter, mlir::Type elementType = boxType.unwrapInnerType(); llvm::SmallVector elidedLenParams = fir::factory::elideLengthsAlreadyInType(elementType, lenParams); - auto packOp = builder.create( - loc, fir::getBase(exv), stackAlloc, isInnermostMode, noCopy, + auto packOp = fir::PackArrayOp::create( + builder, loc, fir::getBase(exv), stackAlloc, isInnermostMode, noCopy, /*max_size=*/mlir::IntegerAttr{}, /*max_element_size=*/mlir::IntegerAttr{}, /*min_stride=*/mlir::IntegerAttr{}, fir::PackArrayHeuristics::None, @@ -2816,6 +2841,6 @@ void Fortran::lower::genUnpackArray( // Avoid copy-out for 'intent(in)' variables. 
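Most of the remaining hunks in this file are a mechanical migration from the OpBuilder template form to each op's static create method, which takes the builder as its first argument. A minimal before/after sketch (illustrative only), assuming the `builder`, `loc`, and `global` values used in the surrounding code:

    // Form the removed lines used:
    auto addrOld = builder.create<fir::AddrOfOp>(loc, global.resultType(),
                                                 global.getSymbol());
    // Form the added lines use:
    auto addrNew = fir::AddrOfOp::create(builder, loc, global.resultType(),
                                         global.getSymbol());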
bool noCopy = Fortran::semantics::IsIntentIn(sym); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - builder.create(loc, temp, original, stackAlloc, noCopy, - getSafeRepackAttrs(converter)); + fir::UnpackArrayOp::create(builder, loc, temp, original, stackAlloc, noCopy, + getSafeRepackAttrs(converter)); } diff --git a/flang/lib/Lower/CustomIntrinsicCall.cpp b/flang/lib/Lower/CustomIntrinsicCall.cpp index 30c6ce7f53b3f..2c5233bdd15ee 100644 --- a/flang/lib/Lower/CustomIntrinsicCall.cpp +++ b/flang/lib/Lower/CustomIntrinsicCall.cpp @@ -101,7 +101,7 @@ Fortran::lower::genIntrinsicCall(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value addr = fir::getBase(result); if (auto *box = result.getBoxOf()) addr = - builder.create(loc, box->getMemTy(), box->getAddr()); + fir::BoxAddrOp::create(builder, loc, box->getMemTy(), box->getAddr()); fir::FirOpBuilder *bldr = &builder; stmtCtx.attachCleanup([=]() { bldr->create(loc, addr); }); } @@ -171,9 +171,9 @@ lowerMinOrMax(fir::FirOpBuilder &builder, mlir::Location loc, args.emplace_back(getOperand(opIndex, loadOperand)); fir::ExtendedValue newExtremum = genIntrinsicCall( builder, loc, name, resultType, args, stmtCtx); - builder.create(loc, fir::getBase(newExtremum)); + fir::ResultOp::create(builder, loc, fir::getBase(newExtremum)); }) - .genElse([&]() { builder.create(loc, extremum); }) + .genElse([&]() { fir::ResultOp::create(builder, loc, extremum); }) .getResults()[0]; } else { // Argument is know to be present at compile time. @@ -235,13 +235,13 @@ lowerIshftc(fir::FirOpBuilder &builder, mlir::Location loc, fir::ExtendedValue sizeExv = getOperand(2, loadOperand); mlir::Value size = builder.createConvert(loc, resultType, fir::getBase(sizeExv)); - builder.create(loc, size); + fir::ResultOp::create(builder, loc, size); }) .genElse([&]() { mlir::Value bitSize = builder.createIntegerConstant( loc, resultType, mlir::cast(resultType).getWidth()); - builder.create(loc, bitSize); + fir::ResultOp::create(builder, loc, bitSize); }) .getResults()[0]); return genIntrinsicCall(builder, loc, name, resultType, args, stmtCtx); @@ -280,7 +280,7 @@ lowerAssociated(fir::FirOpBuilder &builder, mlir::Location loc, // while the optionality of the target pointer/allocatable is what must be // checked here. mlir::Value isPresent = - builder.create(loc, builder.getI1Type(), targetBase); + fir::IsPresentOp::create(builder, loc, builder.getI1Type(), targetBase); mlir::Type targetType = fir::unwrapRefType(targetBase.getType()); mlir::Type targetValueType = fir::unwrapPassByRefType(targetType); mlir::Type boxType = mlir::isa(targetType) @@ -293,11 +293,12 @@ lowerAssociated(fir::FirOpBuilder &builder, mlir::Location loc, .genThen([&]() { mlir::Value box = builder.createBox(loc, targetExv); mlir::Value cast = builder.createConvert(loc, boxType, box); - builder.create(loc, cast); + fir::ResultOp::create(builder, loc, cast); }) .genElse([&]() { - mlir::Value absentBox = builder.create(loc, boxType); - builder.create(loc, absentBox); + mlir::Value absentBox = + fir::AbsentOp::create(builder, loc, boxType); + fir::ResultOp::create(builder, loc, absentBox); }) .getResults()[0]; args.emplace_back(std::move(targetBox)); diff --git a/flang/lib/Lower/HlfirIntrinsics.cpp b/flang/lib/Lower/HlfirIntrinsics.cpp index 8b96b209ddb00..6e1d06a25924b 100644 --- a/flang/lib/Lower/HlfirIntrinsics.cpp +++ b/flang/lib/Lower/HlfirIntrinsics.cpp @@ -63,7 +63,7 @@ class HlfirTransformationalIntrinsic { template inline OP createOp(BUILD_ARGS... 
args) { - return builder.create(loc, args...); + return OP::create(builder, loc, args...); } mlir::Value loadBoxAddress( @@ -195,7 +195,7 @@ mlir::Value HlfirTransformationalIntrinsic::loadBoxAddress( // this is a box address type but is not dynamically optional. Just load // the box, assuming it is well formed (!fir.ref> -> // !fir.box<...>) - return builder.create(loc, actual.getBase()); + return fir::LoadOp::create(builder, loc, actual.getBase()); } return actual; } @@ -209,9 +209,9 @@ mlir::Value HlfirTransformationalIntrinsic::loadBoxAddress( // ensures it won't be. mlir::Value box = builder.createBox(loc, exv); mlir::Type boxType = box.getType(); - auto absent = builder.create(loc, boxType); - auto boxOrAbsent = builder.create( - loc, boxType, isPresent, box, absent); + auto absent = fir::AbsentOp::create(builder, loc, boxType); + auto boxOrAbsent = mlir::arith::SelectOp::create(builder, loc, boxType, + isPresent, box, absent); return boxOrAbsent; } @@ -232,11 +232,11 @@ static mlir::Value loadOptionalValue( assert(actual.isScalar() && fir::isa_trivial(eleType) && "must be a numerical or logical scalar"); hlfir::Entity val = hlfir::loadTrivialScalar(loc, builder, actual); - builder.create(loc, val); + fir::ResultOp::create(builder, loc, val); }) .genElse([&]() { mlir::Value zero = fir::factory::createZeroValue(builder, loc, eleType); - builder.create(loc, zero); + fir::ResultOp::create(builder, loc, zero); }) .getResults()[0]; } diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp index 6a44be65a6cde..2a330ccc4eebb 100644 --- a/flang/lib/Lower/HostAssociations.cpp +++ b/flang/lib/Lower/HostAssociations.cpp @@ -165,7 +165,7 @@ class CapturedSimpleScalars : public CapturedSymbols { assert(typeInTuple && "addrInTuple must be an address"); mlir::Value castBox = builder.createConvertWithVolatileCast( args.loc, typeInTuple, fir::getBase(args.hostValue)); - builder.create(args.loc, castBox, args.addrInTuple); + fir::StoreOp::create(builder, args.loc, castBox, args.addrInTuple); } static void getFromTuple(const GetFromTuple &args, @@ -196,7 +196,7 @@ class CapturedProcedure : public CapturedSymbols { assert(typeInTuple && "addrInTuple must be an address"); mlir::Value castBox = builder.createConvertWithVolatileCast( args.loc, typeInTuple, fir::getBase(args.hostValue)); - builder.create(args.loc, castBox, args.addrInTuple); + fir::StoreOp::create(builder, args.loc, castBox, args.addrInTuple); } static void getFromTuple(const GetFromTuple &args, @@ -231,7 +231,7 @@ class CapturedCharacterScalars fir::FirOpBuilder &builder = converter.getFirOpBuilder(); mlir::Value boxchar = fir::factory::CharacterExprHelper(builder, args.loc) .createEmbox(*charBox); - builder.create(args.loc, boxchar, args.addrInTuple); + fir::StoreOp::create(builder, args.loc, boxchar, args.addrInTuple); } static void getFromTuple(const GetFromTuple &args, @@ -269,20 +269,20 @@ class CapturedPolymorphicScalar args.loc, typeInTuple, fir::getBase(args.hostValue)); if (Fortran::semantics::IsOptional(sym)) { auto isPresent = - builder.create(loc, builder.getI1Type(), castBox); + fir::IsPresentOp::create(builder, loc, builder.getI1Type(), castBox); builder.genIfThenElse(loc, isPresent) .genThen([&]() { - builder.create(loc, castBox, args.addrInTuple); + fir::StoreOp::create(builder, loc, castBox, args.addrInTuple); }) .genElse([&]() { mlir::Value null = fir::factory::createUnallocatedBox( builder, loc, typeInTuple, /*nonDeferredParams=*/mlir::ValueRange{}); - builder.create(loc, null, 
args.addrInTuple); + fir::StoreOp::create(builder, loc, null, args.addrInTuple); }) .end(); } else { - builder.create(loc, castBox, args.addrInTuple); + fir::StoreOp::create(builder, loc, castBox, args.addrInTuple); } } static void getFromTuple(const GetFromTuple &args, @@ -297,11 +297,11 @@ class CapturedPolymorphicScalar auto eleTy = boxTy.getEleTy(); if (!fir::isa_ref_type(eleTy)) eleTy = builder.getRefType(eleTy); - auto addr = builder.create(loc, eleTy, box); + auto addr = fir::BoxAddrOp::create(builder, loc, eleTy, box); mlir::Value isPresent = builder.genIsNotNullAddr(loc, addr); - auto absentBox = builder.create(loc, boxTy); - box = - builder.create(loc, isPresent, box, absentBox); + auto absentBox = fir::AbsentOp::create(builder, loc, boxTy); + box = mlir::arith::SelectOp::create(builder, loc, isPresent, box, + absentBox); } bindCapturedSymbol(sym, box, converter, args.symMap); } @@ -331,7 +331,7 @@ class CapturedAllocatableAndPointer assert(typeInTuple && "addrInTuple must be an address"); mlir::Value castBox = builder.createConvertWithVolatileCast( args.loc, typeInTuple, fir::getBase(args.hostValue)); - builder.create(args.loc, castBox, args.addrInTuple); + fir::StoreOp::create(builder, args.loc, castBox, args.addrInTuple); } static void getFromTuple(const GetFromTuple &args, Fortran::lower::AbstractConverter &converter, @@ -404,21 +404,21 @@ class CapturedArrays : public CapturedSymbols { // done on present optional. For absent optionals, simply create a // disassociated pointer (it is illegal to inquire about lower bounds or // lengths of optional according to 15.5.2.12 3 (9) and 10.1.11 2 (7)b). - auto isPresent = builder.create( - loc, builder.getI1Type(), fir::getBase(args.hostValue)); + auto isPresent = fir::IsPresentOp::create( + builder, loc, builder.getI1Type(), fir::getBase(args.hostValue)); builder.genIfThenElse(loc, isPresent) .genThen([&]() { fir::factory::associateMutableBox(builder, loc, boxInTuple, args.hostValue, - /*lbounds=*/std::nullopt); + /*lbounds=*/{}); }) .genElse([&]() { fir::factory::disassociateMutableBox(builder, loc, boxInTuple); }) .end(); } else { - fir::factory::associateMutableBox( - builder, loc, boxInTuple, args.hostValue, /*lbounds=*/std::nullopt); + fir::factory::associateMutableBox(builder, loc, boxInTuple, + args.hostValue, /*lbounds=*/{}); } } @@ -441,8 +441,8 @@ class CapturedArrays : public CapturedSymbols { const unsigned rank = sym.Rank(); for (unsigned dim = 0; dim < rank; ++dim) { mlir::Value dimVal = builder.createIntegerConstant(loc, idxTy, dim); - auto dims = builder.create(loc, idxTy, idxTy, idxTy, - box, dimVal); + auto dims = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, + box, dimVal); lbounds.emplace_back(dims.getResult(0)); } } @@ -464,11 +464,11 @@ class CapturedArrays : public CapturedSymbols { auto eleTy = boxTy.getEleTy(); if (!fir::isa_ref_type(eleTy)) eleTy = builder.getRefType(eleTy); - auto addr = builder.create(loc, eleTy, box); + auto addr = fir::BoxAddrOp::create(builder, loc, eleTy, box); mlir::Value isPresent = builder.genIsNotNullAddr(loc, addr); - auto absentBox = builder.create(loc, boxTy); - box = builder.create(loc, isPresent, box, - absentBox); + auto absentBox = fir::AbsentOp::create(builder, loc, boxTy); + box = mlir::arith::SelectOp::create(builder, loc, isPresent, box, + absentBox); } fir::BoxValue boxValue(box, lbounds, /*explicitParams=*/{}); bindCapturedSymbol(sym, boxValue, converter, args.symMap); @@ -540,7 +540,7 @@ static mlir::Value genTupleCoor(fir::FirOpBuilder &builder, 
mlir::Location loc, auto ty = mlir::isa(varTy) ? mlir::Type(fir::LLVMPointerType::get(varTy)) : mlir::Type(builder.getRefType(varTy)); - return builder.create(loc, ty, tupleArg, offset); + return fir::CoordinateOp::create(builder, loc, ty, tupleArg, offset); } void Fortran::lower::HostAssociations::addSymbolsToBind( @@ -572,7 +572,7 @@ void Fortran::lower::HostAssociations::hostProcedureBindings( mlir::TupleType tupTy = unwrapTupleTy(getArgumentType(converter)); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); mlir::Location loc = converter.getCurrentLocation(); - auto hostTuple = builder.create(loc, tupTy); + auto hostTuple = fir::AllocaOp::create(builder, loc, tupTy); mlir::IntegerType offTy = builder.getIntegerType(32); // Walk the list of tupleSymbols and update the pointers in the tuple. @@ -639,7 +639,7 @@ void Fortran::lower::HostAssociations::internalProcedureBindings( mlir::Value off = builder.createIntegerConstant(loc, offTy, s.index()); mlir::Type varTy = tupTy.getType(s.index()); mlir::Value eleOff = genTupleCoor(builder, loc, varTy, tupleArg, off); - mlir::Value valueInTuple = builder.create(loc, eleOff); + mlir::Value valueInTuple = fir::LoadOp::create(builder, loc, eleOff); GetFromTuple getFromTuple{symMap, valueInTuple, loc}; walkCaptureCategories(getFromTuple, converter, *s.value()); } diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp index 63a612d7ead61..c95c3404a8e26 100644 --- a/flang/lib/Lower/IO.cpp +++ b/flang/lib/Lower/IO.cpp @@ -153,8 +153,8 @@ static mlir::Value genEndIO(Fortran::lower::AbstractConverter &converter, if (csi.ioMsg) { mlir::func::FuncOp getIoMsg = fir::runtime::getIORuntimeFunc(loc, builder); - builder.create( - loc, getIoMsg, + fir::CallOp::create( + builder, loc, getIoMsg, mlir::ValueRange{ cookie, builder.createConvert(loc, getIoMsg.getFunctionType().getInput(1), @@ -164,12 +164,12 @@ static mlir::Value genEndIO(Fortran::lower::AbstractConverter &converter, } mlir::func::FuncOp endIoStatement = fir::runtime::getIORuntimeFunc(loc, builder); - auto call = builder.create(loc, endIoStatement, - mlir::ValueRange{cookie}); + auto call = fir::CallOp::create(builder, loc, endIoStatement, + mlir::ValueRange{cookie}); mlir::Value iostat = call.getResult(0); if (csi.bigUnitIfOp) { stmtCtx.finalizeAndPop(); - builder.create(loc, iostat); + fir::ResultOp::create(builder, loc, iostat); builder.setInsertionPointAfter(csi.bigUnitIfOp); iostat = csi.bigUnitIfOp.getResult(0); } @@ -178,7 +178,7 @@ static mlir::Value genEndIO(Fortran::lower::AbstractConverter &converter, fir::getBase(converter.genExprAddr(loc, csi.ioStatExpr, stmtCtx)); mlir::Value ioStatResult = builder.createConvert(loc, converter.genType(*csi.ioStatExpr), iostat); - builder.create(loc, ioStatResult, ioStatVar); + fir::StoreOp::create(builder, loc, ioStatResult, ioStatVar); } return csi.hasTransferConditionSpec() ? iostat : mlir::Value{}; } @@ -203,8 +203,8 @@ static void makeNextConditionalOn(fir::FirOpBuilder &builder, mlir::IntegerType boolTy = builder.getI1Type(); if (inLoop) resTy = boolTy; - auto ifOp = builder.create(loc, resTy, ok, - /*withElseRegion=*/inLoop); + auto ifOp = fir::IfOp::create(builder, loc, resTy, ok, + /*withElseRegion=*/inLoop); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); } @@ -259,31 +259,34 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, ? 
fir::NameUniquer::doGenerated("default" + suffix) : converter.mangleName(suffix); if (auto table = builder.getNamedGlobal(tableMangleName)) - return builder.createConvert( - loc, refTy, - builder.create(loc, table.resultType(), - table.getSymbol())); + return builder.createConvert(loc, refTy, + fir::AddrOfOp::create(builder, loc, + table.resultType(), + table.getSymbol())); mlir::StringAttr linkOnce = builder.createLinkOnceLinkage(); mlir::Type idxTy = builder.getIndexType(); mlir::Type sizeTy = fir::runtime::getModel()(builder.getContext()); mlir::Type intTy = fir::runtime::getModel()(builder.getContext()); + mlir::Type byteTy = + fir::runtime::getModel()(builder.getContext()); mlir::Type boolTy = fir::runtime::getModel()(builder.getContext()); mlir::Type listTy = fir::SequenceType::get( definedIoProcMap.size(), - mlir::TupleType::get(context, {refTy, refTy, intTy, boolTy})); + mlir::TupleType::get(context, {refTy, refTy, intTy, byteTy})); mlir::Type tableTy = mlir::TupleType::get( context, {sizeTy, fir::ReferenceType::get(listTy), boolTy}); // Define the list of NonTbpDefinedIo procedures. bool tableIsLocal = !definedIoProcMap.empty() && hasLocalDefinedIoProc(definedIoProcMap); - mlir::Value listAddr = - tableIsLocal ? builder.create(loc, listTy) : mlir::Value{}; + mlir::Value listAddr = tableIsLocal + ? fir::AllocaOp::create(builder, loc, listTy) + : mlir::Value{}; std::string listMangleName = tableMangleName + ".list"; auto listFunc = [&](fir::FirOpBuilder &builder) { - mlir::Value list = builder.create(loc, listTy); + mlir::Value list = fir::UndefOp::create(builder, loc, listTy); mlir::IntegerAttr intAttr[4]; for (int i = 0; i < 4; ++i) intAttr[i] = builder.getIntegerAttr(idxTy, i); @@ -292,8 +295,8 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, int n0 = 0, n1; auto insert = [&](mlir::Value val) { idx[1] = intAttr[n1++]; - list = builder.create(loc, listTy, list, val, - builder.getArrayAttr(idx)); + list = fir::InsertValueOp::create(builder, loc, listTy, list, val, + builder.getArrayAttr(idx)); }; for (auto &iface : definedIoProcMap) { idx[0] = builder.getIntegerAttr(idxTy, n0++); @@ -303,8 +306,8 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, std::string dtName = converter.mangleName(dtSym); insert(builder.createConvert( loc, refTy, - builder.create( - loc, fir::ReferenceType::get(converter.genType(dtSym)), + fir::AddrOfOp::create( + builder, loc, fir::ReferenceType::get(converter.genType(dtSym)), builder.getSymbolRefAttr(dtName)))); // defined IO procedure [void (*subroutine)()], may be null const Fortran::semantics::Symbol *procSym = iface.second.subroutine; @@ -314,8 +317,8 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, TODO(loc, "defined IO procedure pointers"); } else if (Fortran::semantics::IsDummy(*procSym)) { Fortran::lower::StatementContext stmtCtx; - insert(builder.create( - loc, refTy, + insert(fir::BoxAddrOp::create( + builder, loc, refTy, fir::getBase(converter.genExprAddr( loc, Fortran::lower::SomeExpr{ @@ -328,8 +331,8 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, builder.getSymbolRefAttr(procDef.getSymName()); insert(builder.createConvert( loc, refTy, - builder.create(loc, procDef.getFunctionType(), - nameAttr))); + fir::AddrOfOp::create(builder, loc, procDef.getFunctionType(), + nameAttr))); } } else { insert(builder.createNullConstant(loc, refTy)); @@ -339,14 +342,14 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, 
insert(builder.createIntegerConstant( loc, intTy, static_cast(iface.second.definedIo))); // polymorphic flag is set if first defined IO dummy arg is CLASS(T) + // defaultInt8 flag is set if -fdefined-integer-8 // [bool isDtvArgPolymorphic] - insert(builder.createIntegerConstant(loc, boolTy, - iface.second.isDtvArgPolymorphic)); + insert(builder.createIntegerConstant(loc, byteTy, iface.second.flags)); } if (tableIsLocal) - builder.create(loc, list, listAddr); + fir::StoreOp::create(builder, loc, list, listAddr); else - builder.create(loc, list); + fir::HasValueOp::create(builder, loc, list); }; if (!definedIoProcMap.empty()) { if (tableIsLocal) @@ -358,33 +361,34 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, // Define the NonTbpDefinedIoTable. mlir::Value tableAddr = tableIsLocal - ? builder.create(loc, tableTy) + ? fir::AllocaOp::create(builder, loc, tableTy) : mlir::Value{}; auto tableFunc = [&](fir::FirOpBuilder &builder) { - mlir::Value table = builder.create(loc, tableTy); + mlir::Value table = fir::UndefOp::create(builder, loc, tableTy); // list item count [std::size_t items] - table = builder.create( - loc, tableTy, table, + table = fir::InsertValueOp::create( + builder, loc, tableTy, table, builder.createIntegerConstant(loc, sizeTy, definedIoProcMap.size()), builder.getArrayAttr(builder.getIntegerAttr(idxTy, 0))); // item list [const NonTbpDefinedIo *item] if (definedIoProcMap.empty()) listAddr = builder.createNullConstant(loc, builder.getRefType(listTy)); else if (fir::GlobalOp list = builder.getNamedGlobal(listMangleName)) - listAddr = builder.create(loc, list.resultType(), - list.getSymbol()); + listAddr = fir::AddrOfOp::create(builder, loc, list.resultType(), + list.getSymbol()); assert(listAddr && "missing namelist object list"); - table = builder.create( - loc, tableTy, table, listAddr, + table = fir::InsertValueOp::create( + builder, loc, tableTy, table, listAddr, builder.getArrayAttr(builder.getIntegerAttr(idxTy, 1))); // [bool ignoreNonTbpEntries] conservatively set to true - table = builder.create( - loc, tableTy, table, builder.createIntegerConstant(loc, boolTy, true), + table = fir::InsertValueOp::create( + builder, loc, tableTy, table, + builder.createIntegerConstant(loc, boolTy, true), builder.getArrayAttr(builder.getIntegerAttr(idxTy, 2))); if (tableIsLocal) - builder.create(loc, table, tableAddr); + fir::StoreOp::create(builder, loc, table, tableAddr); else - builder.create(loc, table); + fir::HasValueOp::create(builder, loc, table); }; if (tableIsLocal) { tableFunc(builder); @@ -392,8 +396,8 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, fir::GlobalOp table = builder.createGlobal( loc, tableTy, tableMangleName, /*isConst=*/true, /*isTarget=*/false, tableFunc, linkOnce); - tableAddr = builder.create( - loc, fir::ReferenceType::get(tableTy), table.getSymbol()); + tableAddr = fir::AddrOfOp::create( + builder, loc, fir::ReferenceType::get(tableTy), table.getSymbol()); } assert(tableAddr && "missing NonTbpDefinedIo table result"); return builder.createConvert(loc, refTy, tableAddr); @@ -418,8 +422,8 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, mlir::Location loc = converter.getCurrentLocation(); std::string groupMangleName = converter.mangleName(symbol); if (auto group = builder.getNamedGlobal(groupMangleName)) - return builder.create(loc, group.resultType(), - group.getSymbol()); + return fir::AddrOfOp::create(builder, loc, group.resultType(), + group.getSymbol()); const auto &details = 
symbol.GetUltimate().get(); @@ -466,18 +470,19 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, auto descFunc = [&](fir::FirOpBuilder &b) { auto box = Fortran::lower::genInitialDataTarget( converter, loc, boxTy, *expr, /*couldBeInEquivalence=*/true); - b.create(loc, box); + fir::HasValueOp::create(b, loc, box); }; builder.createGlobalConstant(loc, boxTy, mangleName, descFunc, linkOnce); } } // Define the list of Items. - mlir::Value listAddr = - groupIsLocal ? builder.create(loc, listTy) : mlir::Value{}; + mlir::Value listAddr = groupIsLocal + ? fir::AllocaOp::create(builder, loc, listTy) + : mlir::Value{}; std::string listMangleName = groupMangleName + ".list"; auto listFunc = [&](fir::FirOpBuilder &builder) { - mlir::Value list = builder.create(loc, listTy); + mlir::Value list = fir::UndefOp::create(builder, loc, listTy); mlir::IntegerAttr zero = builder.getIntegerAttr(idxTy, 0); mlir::IntegerAttr one = builder.getIntegerAttr(idxTy, 1); llvm::SmallVector idx = {mlir::Attribute{}, @@ -488,14 +493,14 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, idx[1] = zero; mlir::Value nameAddr = builder.createConvert(loc, charRefTy, fir::getBase(stringAddress(s))); - list = builder.create(loc, listTy, list, nameAddr, - builder.getArrayAttr(idx)); + list = fir::InsertValueOp::create(builder, loc, listTy, list, nameAddr, + builder.getArrayAttr(idx)); idx[1] = one; mlir::Value descAddr; if (auto desc = builder.getNamedGlobal( Fortran::lower::mangle::globalNamelistDescriptorName(s))) { - descAddr = builder.create(loc, desc.resultType(), - desc.getSymbol()); + descAddr = fir::AddrOfOp::create(builder, loc, desc.resultType(), + desc.getSymbol()); } else if (Fortran::semantics::FindCommonBlockContaining(s) && IsAllocatableOrPointer(s)) { mlir::Type symType = converter.genType(s); @@ -503,8 +508,8 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, Fortran::semantics::FindCommonBlockContaining(s); std::string commonBlockName = converter.mangleName(*commonBlockSym); fir::GlobalOp commonGlobal = builder.getNamedGlobal(commonBlockName); - mlir::Value commonBlockAddr = builder.create( - loc, commonGlobal.resultType(), commonGlobal.getSymbol()); + mlir::Value commonBlockAddr = fir::AddrOfOp::create( + builder, loc, commonGlobal.resultType(), commonGlobal.getSymbol()); mlir::IntegerType i8Ty = builder.getIntegerType(8); mlir::Type i8Ptr = builder.getRefType(i8Ty); mlir::Type seqTy = builder.getRefType(builder.getVarLenSeqTy(i8Ty)); @@ -512,8 +517,8 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, std::size_t byteOffset = s.GetUltimate().offset(); mlir::Value offs = builder.createIntegerConstant( loc, builder.getIndexType(), byteOffset); - mlir::Value varAddr = builder.create( - loc, i8Ptr, base, mlir::ValueRange{offs}); + mlir::Value varAddr = fir::CoordinateOp::create( + builder, loc, i8Ptr, base, mlir::ValueRange{offs}); descAddr = builder.createConvert(loc, builder.getRefType(symType), varAddr); } else { @@ -526,16 +531,16 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, descAddr = builder.createTemporary(loc, boxType); fir::MutableBoxValue box = fir::MutableBoxValue(descAddr, {}, {}); fir::factory::associateMutableBox(builder, loc, box, exv, - /*lbounds=*/std::nullopt); + /*lbounds=*/{}); } descAddr = builder.createConvert(loc, descRefTy, descAddr); - list = builder.create(loc, listTy, list, descAddr, - builder.getArrayAttr(idx)); + list = fir::InsertValueOp::create(builder, loc, listTy, list, descAddr, + 
builder.getArrayAttr(idx)); } if (groupIsLocal) - builder.create(loc, list, listAddr); + fir::StoreOp::create(builder, loc, list, listAddr); else - builder.create(loc, list); + fir::HasValueOp::create(builder, loc, list); }; if (groupIsLocal) listFunc(builder); @@ -545,39 +550,39 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, // Define the group. mlir::Value groupAddr = groupIsLocal - ? builder.create(loc, groupTy) + ? fir::AllocaOp::create(builder, loc, groupTy) : mlir::Value{}; auto groupFunc = [&](fir::FirOpBuilder &builder) { - mlir::Value group = builder.create(loc, groupTy); + mlir::Value group = fir::UndefOp::create(builder, loc, groupTy); // group name [const char *groupName] - group = builder.create( - loc, groupTy, group, + group = fir::InsertValueOp::create( + builder, loc, groupTy, group, builder.createConvert(loc, charRefTy, fir::getBase(stringAddress(symbol))), builder.getArrayAttr(builder.getIntegerAttr(idxTy, 0))); // list item count [std::size_t items] - group = builder.create( - loc, groupTy, group, + group = fir::InsertValueOp::create( + builder, loc, groupTy, group, builder.createIntegerConstant(loc, sizeTy, details.objects().size()), builder.getArrayAttr(builder.getIntegerAttr(idxTy, 1))); // item list [const Item *item] if (fir::GlobalOp list = builder.getNamedGlobal(listMangleName)) - listAddr = builder.create(loc, list.resultType(), - list.getSymbol()); + listAddr = fir::AddrOfOp::create(builder, loc, list.resultType(), + list.getSymbol()); assert(listAddr && "missing namelist object list"); - group = builder.create( - loc, groupTy, group, listAddr, + group = fir::InsertValueOp::create( + builder, loc, groupTy, group, listAddr, builder.getArrayAttr(builder.getIntegerAttr(idxTy, 2))); // non-type-bound defined IO procedures // [const NonTbpDefinedIoTable *nonTbpDefinedIo] - group = builder.create( - loc, groupTy, group, + group = fir::InsertValueOp::create( + builder, loc, groupTy, group, getNonTbpDefinedIoTableAddr(converter, definedIoProcMap), builder.getArrayAttr(builder.getIntegerAttr(idxTy, 3))); if (groupIsLocal) - builder.create(loc, group, groupAddr); + fir::StoreOp::create(builder, loc, group, groupAddr); else - builder.create(loc, group); + fir::HasValueOp::create(builder, loc, group); }; if (groupIsLocal) { groupFunc(builder); @@ -585,8 +590,8 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, fir::GlobalOp group = builder.createGlobal( loc, groupTy, groupMangleName, /*isConst=*/true, /*isTarget=*/false, groupFunc, linkOnce); - groupAddr = builder.create(loc, group.resultType(), - group.getSymbol()); + groupAddr = fir::AddrOfOp::create(builder, loc, group.resultType(), + group.getSymbol()); } assert(groupAddr && "missing namelist group result"); return groupAddr; @@ -606,7 +611,7 @@ static void genNamelistIO(Fortran::lower::AbstractConverter &converter, getNamelistGroup(converter, symbol.GetUltimate(), stmtCtx); groupAddr = builder.createConvert(loc, argType, groupAddr); llvm::SmallVector args = {cookie, groupAddr}; - ok = builder.create(loc, funcOp, args).getResult(0); + ok = fir::CallOp::create(builder, loc, funcOp, args).getResult(0); } /// Is \p type a derived type or an array of derived type? 
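The namelist item list and group above are emitted as constant globals whose initializer regions build a tuple value one field at a time and yield it with fir.has_value. A trimmed sketch of that pattern (illustrative, not part of the patch), assuming `builder`, `loc`, the tuple type `groupTy`, the index type `idxTy`, and an already-computed field value such as `nameAddr` from the hunk:

    mlir::Value group = fir::UndefOp::create(builder, loc, groupTy);
    // Insert field 0 (here, the group-name address) into the tuple value.
    group = fir::InsertValueOp::create(
        builder, loc, groupTy, group, nameAddr,
        builder.getArrayAttr(builder.getIntegerAttr(idxTy, 0)));
    // ... repeat for the remaining tuple fields ...
    fir::HasValueOp::create(builder, loc, group);  // result of the initializer region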
@@ -749,7 +754,7 @@ static void genOutputItemList( outputFuncArgs.push_back(itemValue); } } - ok = builder.create(loc, outputFunc, outputFuncArgs) + ok = fir::CallOp::create(builder, loc, outputFunc, outputFuncArgs) .getResult(0); } } @@ -810,12 +815,12 @@ static void boolRefToLogical(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Value addr) { auto boolType = builder.getRefType(builder.getI1Type()); auto boolAddr = builder.createConvert(loc, boolType, addr); - auto boolValue = builder.create(loc, boolAddr); + auto boolValue = fir::LoadOp::create(builder, loc, boolAddr); auto logicalType = fir::unwrapPassByRefType(addr.getType()); // The convert avoid making any assumptions about how LOGICALs are actually // represented (it might end-up being either a signed or zero extension). auto logicalValue = builder.createConvert(loc, logicalType, boolValue); - builder.create(loc, logicalValue, addr); + fir::StoreOp::create(builder, loc, logicalValue, addr); } static mlir::Value @@ -847,12 +852,13 @@ createIoRuntimeCallForItem(Fortran::lower::AbstractConverter &converter, inputFuncArgs.push_back(builder.createConvert( loc, inputFunc.getFunctionType().getInput(2), len)); } else if (mlir::isa(itemTy)) { - inputFuncArgs.push_back(builder.create( - loc, builder.getI32IntegerAttr( - mlir::cast(itemTy).getWidth() / 8))); + inputFuncArgs.push_back(mlir::arith::ConstantOp::create( + builder, loc, + builder.getI32IntegerAttr( + mlir::cast(itemTy).getWidth() / 8))); } } - auto call = builder.create(loc, inputFunc, inputFuncArgs); + auto call = fir::CallOp::create(builder, loc, inputFunc, inputFuncArgs); auto itemAddr = fir::getBase(item); auto itemTy = fir::unwrapRefType(itemAddr.getType()); if (mlir::isa(itemTy)) @@ -949,7 +955,7 @@ static void genIoLoop(Fortran::lower::AbstractConverter &converter, mlir::Value stepValue = control.step.has_value() ? genControlValue(*control.step) - : builder.create(loc, 1); + : mlir::arith::ConstantIndexOp::create(builder, loc, 1); auto genItemList = [&](const D &ioImpliedDo) { if constexpr (std::is_same_v) genInputItemList(converter, cookie, itemList, isFormatted, checkResult, @@ -960,35 +966,36 @@ static void genIoLoop(Fortran::lower::AbstractConverter &converter, }; if (!checkResult) { // No IO call result checks - the loop is a fir.do_loop op. - auto doLoopOp = builder.create( - loc, lowerValue, upperValue, stepValue, /*unordered=*/false, - /*finalCountValue=*/true); + auto doLoopOp = fir::DoLoopOp::create(builder, loc, lowerValue, upperValue, + stepValue, /*unordered=*/false, + /*finalCountValue=*/true); builder.setInsertionPointToStart(doLoopOp.getBody()); mlir::Value lcv = builder.createConvert( loc, fir::unwrapRefType(loopVar.getType()), doLoopOp.getInductionVar()); - builder.create(loc, lcv, loopVar); + fir::StoreOp::create(builder, loc, lcv, loopVar); genItemList(ioImpliedDo); builder.setInsertionPointToEnd(doLoopOp.getBody()); - mlir::Value result = builder.create( - loc, doLoopOp.getInductionVar(), doLoopOp.getStep(), iofAttr); - builder.create(loc, result); + mlir::Value result = mlir::arith::AddIOp::create( + builder, loc, doLoopOp.getInductionVar(), doLoopOp.getStep(), iofAttr); + fir::ResultOp::create(builder, loc, result); builder.setInsertionPointAfter(doLoopOp); // The loop control variable may be used after the loop. 
lcv = builder.createConvert(loc, fir::unwrapRefType(loopVar.getType()), doLoopOp.getResult(0)); - builder.create(loc, lcv, loopVar); + fir::StoreOp::create(builder, loc, lcv, loopVar); return; } // Check IO call results - the loop is a fir.iterate_while op. if (!ok) ok = builder.createBool(loc, true); - auto iterWhileOp = builder.create( - loc, lowerValue, upperValue, stepValue, ok, /*finalCountValue*/ true); + auto iterWhileOp = + fir::IterWhileOp::create(builder, loc, lowerValue, upperValue, stepValue, + ok, /*finalCountValue*/ true); builder.setInsertionPointToStart(iterWhileOp.getBody()); mlir::Value lcv = builder.createConvert(loc, fir::unwrapRefType(loopVar.getType()), iterWhileOp.getInductionVar()); - builder.create(loc, lcv, loopVar); + fir::StoreOp::create(builder, loc, lcv, loopVar); ok = iterWhileOp.getIterateVar(); mlir::Value falseValue = builder.createIntegerConstant(loc, builder.getI1Type(), 0); @@ -1001,28 +1008,28 @@ static void genIoLoop(Fortran::lower::AbstractConverter &converter, builder.setInsertionPointAfter(lastOp); // The primary ifOp result is the result of an IO call or loop. if (mlir::isa(*lastOp)) - builder.create(loc, lastOp->getResult(0)); + fir::ResultOp::create(builder, loc, lastOp->getResult(0)); else - builder.create(loc, ok); // loop result + fir::ResultOp::create(builder, loc, ok); // loop result // The else branch propagates an early exit false result. builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - builder.create(loc, falseValue); + fir::ResultOp::create(builder, loc, falseValue); } builder.setInsertionPointToEnd(iterWhileOp.getBody()); mlir::OpResult iterateResult = builder.getBlock()->back().getResult(0); mlir::Value inductionResult0 = iterWhileOp.getInductionVar(); - auto inductionResult1 = builder.create( - loc, inductionResult0, iterWhileOp.getStep(), iofAttr); - auto inductionResult = builder.create( - loc, iterateResult, inductionResult1, inductionResult0); + auto inductionResult1 = mlir::arith::AddIOp::create( + builder, loc, inductionResult0, iterWhileOp.getStep(), iofAttr); + auto inductionResult = mlir::arith::SelectOp::create( + builder, loc, iterateResult, inductionResult1, inductionResult0); llvm::SmallVector results = {inductionResult, iterateResult}; - builder.create(loc, results); + fir::ResultOp::create(builder, loc, results); ok = iterWhileOp.getResult(1); builder.setInsertionPointAfter(iterWhileOp); // The loop control variable may be used after the loop. 
lcv = builder.createConvert(loc, fir::unwrapRefType(loopVar.getType()), iterWhileOp.getResult(0)); - builder.create(loc, lcv, loopVar); + fir::StoreOp::create(builder, loc, lcv, loopVar); } //===----------------------------------------------------------------------===// @@ -1044,15 +1051,15 @@ static mlir::Value locToLineNo(Fortran::lower::AbstractConverter &converter, static mlir::Value getDefaultScratch(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type toType) { - mlir::Value null = builder.create( - loc, builder.getI64IntegerAttr(0)); + mlir::Value null = mlir::arith::ConstantOp::create( + builder, loc, builder.getI64IntegerAttr(0)); return builder.createConvert(loc, toType, null); } static mlir::Value getDefaultScratchLen(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type toType) { - return builder.create( - loc, builder.getIntegerAttr(toType, 0)); + return mlir::arith::ConstantOp::create(builder, loc, + builder.getIntegerAttr(toType, 0)); } /// Generate a reference to a buffer and the length of buffer given @@ -1103,8 +1110,8 @@ lowerStringLit(Fortran::lower::AbstractConverter &converter, mlir::Location loc, mlir::Value kind; if (ty2) { auto kindVal = expr->GetType().value().kind(); - kind = builder.create( - loc, builder.getIntegerAttr(ty2, kindVal)); + kind = mlir::arith::ConstantOp::create( + builder, loc, builder.getIntegerAttr(ty2, kindVal)); } return {buff, len, kind}; } @@ -1144,7 +1151,7 @@ mlir::Value genIntIOOption(Fortran::lower::AbstractConverter &converter, loc, Fortran::semantics::GetExpr(spec.v), localStatementCtx)); mlir::Value val = builder.createConvert(loc, ioFuncTy.getInput(1), expr); llvm::SmallVector ioArgs = {cookie, val}; - return builder.create(loc, ioFunc, ioArgs).getResult(0); + return fir::CallOp::create(builder, loc, ioFunc, ioArgs).getResult(0); } /// Generic to build a string argument to the runtime. 
This passes a CHARACTER @@ -1162,7 +1169,7 @@ mlir::Value genCharIOOption(Fortran::lower::AbstractConverter &converter, ioFuncTy.getInput(1), ioFuncTy.getInput(2)); llvm::SmallVector ioArgs = {cookie, std::get<0>(tup), std::get<1>(tup)}; - return builder.create(loc, ioFunc, ioArgs).getResult(0); + return fir::CallOp::create(builder, loc, ioFunc, ioArgs).getResult(0); } template @@ -1195,7 +1202,7 @@ mlir::Value genIOOption( ioFuncTy.getInput(1), ioFuncTy.getInput(2)); llvm::SmallVector ioArgs{cookie, std::get<0>(tup), std::get<1>(tup)}; - return builder.create(loc, ioFunc, ioArgs).getResult(0); + return fir::CallOp::create(builder, loc, ioFunc, ioArgs).getResult(0); } template <> @@ -1260,7 +1267,7 @@ mlir::Value genIOOption( ioFuncTy.getInput(1), ioFuncTy.getInput(2)); llvm::SmallVector ioArgs = {cookie, std::get<0>(tup), std::get<1>(tup)}; - return builder.create(loc, ioFunc, ioArgs).getResult(0); + return fir::CallOp::create(builder, loc, ioFunc, ioArgs).getResult(0); } template <> @@ -1314,7 +1321,7 @@ mlir::Value genIOOption( ioFuncTy.getInput(1), ioFuncTy.getInput(2)); llvm::SmallVector ioArgs = {cookie, std::get<0>(tup), std::get<1>(tup)}; - return builder.create(loc, ioFunc, ioArgs).getResult(0); + return fir::CallOp::create(builder, loc, ioFunc, ioArgs).getResult(0); } template <> @@ -1350,7 +1357,7 @@ static void genIOGetVar(Fortran::lower::AbstractConverter &converter, mlir::func::FuncOp ioFunc = fir::runtime::getIORuntimeFunc(loc, builder); mlir::Value value = - builder.create(loc, ioFunc, mlir::ValueRange{cookie}) + fir::CallOp::create(builder, loc, ioFunc, mlir::ValueRange{cookie}) .getResult(0); Fortran::lower::StatementContext localStatementCtx; fir::ExtendedValue var = converter.genExprAddr( @@ -1478,8 +1485,8 @@ genConditionHandlerCall(Fortran::lower::AbstractConverter &converter, fir::runtime::getIORuntimeFunc(loc, builder); mlir::Type boolType = enableHandlers.getFunctionType().getInput(1); auto boolValue = [&](bool specifierIsPresent) { - return builder.create( - loc, builder.getIntegerAttr(boolType, specifierIsPresent)); + return mlir::arith::ConstantOp::create( + builder, loc, builder.getIntegerAttr(boolType, specifierIsPresent)); }; llvm::SmallVector ioArgs = {cookie, boolValue(csi.ioStatExpr != nullptr), @@ -1487,7 +1494,7 @@ genConditionHandlerCall(Fortran::lower::AbstractConverter &converter, boolValue(csi.hasEnd), boolValue(csi.hasEor), boolValue(csi.ioMsg.has_value())}; - builder.create(loc, enableHandlers, ioArgs); + fir::CallOp::create(builder, loc, enableHandlers, ioArgs); } //===----------------------------------------------------------------------===// @@ -1661,7 +1668,7 @@ lowerReferenceAsStringSelect(Fortran::lower::AbstractConverter &converter, // Pass the format string reference and the string length out of the select // statement. llvm::SmallVector args = {stringRef, stringLen}; - builder.create(loc, endBlock, args); + mlir::cf::BranchOp::create(builder, loc, endBlock, args); // Add block to the list of cases and make a new one. blockList.push_back(block); @@ -1676,13 +1683,13 @@ lowerReferenceAsStringSelect(Fortran::lower::AbstractConverter &converter, builder, loc, "Assigned format variable '" + symbol->name().ToString() + "' has not been assigned a valid format label"); - builder.create(loc); + fir::UnreachableOp::create(builder, loc); blockList.push_back(unitBlock); // Lower the selectOp. 
builder.setInsertionPointToEnd(startBlock); auto label = fir::getBase(converter.genExprValue(loc, &expr, stmtCtx)); - builder.create(loc, label, indexList, blockList); + fir::SelectOp::create(builder, loc, label, indexList, blockList); builder.setInsertionPointToEnd(endBlock); endBlock->addArgument(strTy, loc); @@ -1812,17 +1819,17 @@ static mlir::Value genIOUnitNumber(Fortran::lower::AbstractConverter &converter, mlir::Value line = locToLineNo(converter, loc, funcTy.getInput(5)); args.push_back(file); args.push_back(line); - auto checkCall = builder.create(loc, check, args); + auto checkCall = fir::CallOp::create(builder, loc, check, args); if (csi.hasErrorConditionSpec()) { mlir::Value iostat = checkCall.getResult(0); mlir::Type iostatTy = iostat.getType(); mlir::Value zero = fir::factory::createZeroValue(builder, loc, iostatTy); - mlir::Value unitIsOK = builder.create( - loc, mlir::arith::CmpIPredicate::eq, iostat, zero); - auto ifOp = builder.create(loc, iostatTy, unitIsOK, - /*withElseRegion=*/true); + mlir::Value unitIsOK = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, iostat, zero); + auto ifOp = fir::IfOp::create(builder, loc, iostatTy, unitIsOK, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - builder.create(loc, iostat); + fir::ResultOp::create(builder, loc, iostat); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); stmtCtx.pushScope(); csi.bigUnitIfOp = ifOp; @@ -1844,8 +1851,8 @@ static mlir::Value genIOUnit(Fortran::lower::AbstractConverter &converter, &iounit->u)) return genIOUnitNumber(converter, loc, Fortran::semantics::GetExpr(*e), ty, csi, stmtCtx); - return builder.create( - loc, builder.getIntegerAttr(ty, defaultUnitNumber)); + return mlir::arith::ConstantOp::create( + builder, loc, builder.getIntegerAttr(ty, defaultUnitNumber)); } template @@ -1877,8 +1884,8 @@ static mlir::Value genBasicIOStmt(Fortran::lower::AbstractConverter &converter, mlir::Value un = builder.createConvert(loc, beginFuncTy.getInput(0), unit); mlir::Value file = locToFilename(converter, loc, beginFuncTy.getInput(1)); mlir::Value line = locToLineNo(converter, loc, beginFuncTy.getInput(2)); - auto call = builder.create(loc, beginFunc, - mlir::ValueRange{un, file, line}); + auto call = fir::CallOp::create(builder, loc, beginFunc, + mlir::ValueRange{un, file, line}); mlir::Value cookie = call.getResult(0); genConditionHandlerCall(converter, loc, cookie, stmt.v, csi); mlir::Value ok; @@ -1932,7 +1939,7 @@ genNewunitSpec(Fortran::lower::AbstractConverter &converter, mlir::Location loc, auto kind = builder.createIntegerConstant(loc, ioFuncTy.getInput(2), var->GetType().value().kind()); llvm::SmallVector ioArgs = {cookie, addr, kind}; - return builder.create(loc, ioFunc, ioArgs).getResult(0); + return fir::CallOp::create(builder, loc, ioFunc, ioArgs).getResult(0); } llvm_unreachable("missing Newunit spec"); } @@ -1967,7 +1974,7 @@ Fortran::lower::genOpenStatement(Fortran::lower::AbstractConverter &converter, beginArgs.push_back(locToLineNo(converter, loc, beginFuncTy.getInput(1))); } auto cookie = - builder.create(loc, beginFunc, beginArgs).getResult(0); + fir::CallOp::create(builder, loc, beginFunc, beginArgs).getResult(0); genConditionHandlerCall(converter, loc, cookie, stmt.v, csi); mlir::Value ok; auto insertPt = builder.saveInsertionPoint(); @@ -2011,7 +2018,7 @@ Fortran::lower::genWaitStatement(Fortran::lower::AbstractConverter &converter, args.push_back(locToFilename(converter, loc, beginFuncTy.getInput(1))); 
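In genIOUnitNumber above, when error condition specifiers are present (csi.hasErrorConditionSpec()), the rest of the I/O lowering is wrapped in a fir.if guarded by the result of the unit range check call. A condensed sketch of that guard (illustrative only), assuming `builder`, `loc`, `iostat`, and its type `iostatTy` from the hunk:

    mlir::Value zero = fir::factory::createZeroValue(builder, loc, iostatTy);
    mlir::Value unitIsOK = mlir::arith::CmpIOp::create(
        builder, loc, mlir::arith::CmpIPredicate::eq, iostat, zero);
    auto ifOp = fir::IfOp::create(builder, loc, iostatTy, unitIsOK,
                                  /*withElseRegion=*/true);
    // Else branch: propagate the bad iostat value as the fir.if result.
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    fir::ResultOp::create(builder, loc, iostat);
    // Then branch: the remaining I/O calls are generated here.
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());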
args.push_back(locToLineNo(converter, loc, beginFuncTy.getInput(2))); } - auto cookie = builder.create(loc, beginFunc, args).getResult(0); + auto cookie = fir::CallOp::create(builder, loc, beginFunc, args).getResult(0); genConditionHandlerCall(converter, loc, cookie, stmt.v, csi); return genEndIO(converter, converter.getCurrentLocation(), cookie, csi, stmtCtx); @@ -2147,9 +2154,10 @@ void genBeginDataTransferCallArgs( } } else { // PRINT - maybe explicit format; default unit maybeGetFormatArgs(); - ioArgs.push_back(builder.create( - loc, builder.getIntegerAttr(ioFuncTy.getInput(ioArgs.size()), - defaultUnitNumber))); + ioArgs.push_back(mlir::arith::ConstantOp::create( + builder, loc, + builder.getIntegerAttr(ioFuncTy.getInput(ioArgs.size()), + defaultUnitNumber))); } // File name and line number are always the last two arguments. ioArgs.push_back( @@ -2196,7 +2204,7 @@ genDataTransferStmt(Fortran::lower::AbstractConverter &converter, ioArgs, converter, loc, stmt, ioFunc.getFunctionType(), isFormatted, isList || isNml, isInternal, descRef, csi, stmtCtx); mlir::Value cookie = - builder.create(loc, ioFunc, ioArgs).getResult(0); + fir::CallOp::create(builder, loc, ioFunc, ioArgs).getResult(0); auto insertPt = builder.saveInsertionPoint(); mlir::Value ok; @@ -2330,7 +2338,7 @@ mlir::Value genInquireSpec( .c_str())), builder.createConvert(loc, specFuncTy.getInput(2), fir::getBase(str)), builder.createConvert(loc, specFuncTy.getInput(3), fir::getLen(str))}; - return builder.create(loc, specFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, specFunc, args).getResult(0); } /// Specialization for INTEGER. template <> @@ -2367,7 +2375,7 @@ mlir::Value genInquireSpec( .c_str())), builder.createConvert(loc, specFuncTy.getInput(2), addr), builder.createConvert(loc, specFuncTy.getInput(3), kind)}; - return builder.create(loc, specFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, specFunc, args).getResult(0); } /// Specialization for LOGICAL and (PENDING + ID). template <> @@ -2404,7 +2412,7 @@ mlir::Value genInquireSpec( Fortran::parser::InquireSpec::LogVar::EnumToString(logVarKind)} .c_str()))); args.push_back(builder.createConvert(loc, specFuncTy.getInput(2), addr)); - auto call = builder.create(loc, specFunc, args); + auto call = fir::CallOp::create(builder, loc, specFunc, args); boolRefToLogical(loc, builder, addr); return call.getResult(0); } @@ -2500,7 +2508,7 @@ mlir::Value Fortran::lower::genInquireStatement( beginArgs = {locToFilename(converter, loc, beginFuncTy.getInput(0)), locToLineNo(converter, loc, beginFuncTy.getInput(1))}; auto cookie = - builder.create(loc, beginFunc, beginArgs).getResult(0); + fir::CallOp::create(builder, loc, beginFunc, beginArgs).getResult(0); mlir::Value ok; genOutputItemList( converter, cookie, @@ -2521,14 +2529,14 @@ mlir::Value Fortran::lower::genInquireStatement( .getResult(0); mlir::Value length1 = builder.createConvert(loc, converter.genType(*ioLengthVar), length); - builder.create(loc, length1, ioLengthVarAddr); + fir::StoreOp::create(builder, loc, length1, ioLengthVarAddr); return genEndIO(converter, loc, cookie, csi, stmtCtx); } // Common handling for inquire by unit or file. assert(list && "inquire-spec list must be present"); auto cookie = - builder.create(loc, beginFunc, beginArgs).getResult(0); + fir::CallOp::create(builder, loc, beginFunc, beginArgs).getResult(0); genConditionHandlerCall(converter, loc, cookie, *list, csi); // Handle remaining arguments in specifier list. 
mlir::Value ok; diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 39e4444cde4e3..471f3685974cd 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -132,21 +132,21 @@ createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc, /*withElseRegion=*/true) .genThen([&]() { if (fir::isBoxAddress(baseAddr.getType())) - baseAddr = builder.create(loc, baseAddr); + baseAddr = fir::LoadOp::create(builder, loc, baseAddr); mlir::Value boxAddr = - builder.create(loc, baseAddr); - builder.create(loc, mlir::ValueRange{boxAddr}); + fir::BoxAddrOp::create(builder, loc, baseAddr); + fir::ResultOp::create(builder, loc, mlir::ValueRange{boxAddr}); }) .genElse([&] { mlir::Value absent = - builder.create(loc, ifRetTy); - builder.create(loc, mlir::ValueRange{absent}); + fir::AbsentOp::create(builder, loc, ifRetTy); + fir::ResultOp::create(builder, loc, mlir::ValueRange{absent}); }) .getResults()[0]; } else { if (fir::isBoxAddress(baseAddr.getType())) - baseAddr = builder.create(loc, baseAddr); - baseAddr = builder.create(loc, baseAddr); + baseAddr = fir::LoadOp::create(builder, loc, baseAddr); + baseAddr = fir::BoxAddrOp::create(builder, loc, baseAddr); } retTy = baseAddr.getType(); } @@ -159,7 +159,7 @@ createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc, addOperands(operands, operandSegments, bounds); addOperands(operands, operandSegments, async); - Op op = builder.create(loc, retTy, operands); + Op op = Op::create(builder, loc, retTy, operands); op.setNameAttr(builder.getStringAttr(name.str())); op.setStructured(structured); op.setImplicit(implicit); @@ -198,12 +198,12 @@ createDeclareFunc(mlir::OpBuilder &modBuilder, fir::FirOpBuilder &builder, llvm::SmallVector argsTy = {}, llvm::SmallVector locs = {}) { auto funcTy = mlir::FunctionType::get(modBuilder.getContext(), argsTy, {}); - auto funcOp = modBuilder.create(loc, funcName, funcTy); + auto funcOp = mlir::func::FuncOp::create(modBuilder, loc, funcName, funcTy); funcOp.setVisibility(mlir::SymbolTable::Visibility::Private); builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy, locs); builder.setInsertionPointToEnd(&funcOp.getRegion().back()); - builder.create(loc); + mlir::func::ReturnOp::create(builder, loc); builder.setInsertionPointToStart(&funcOp.getRegion().back()); return funcOp; } @@ -214,7 +214,7 @@ createSimpleOp(fir::FirOpBuilder &builder, mlir::Location loc, const llvm::SmallVectorImpl &operands, const llvm::SmallVectorImpl &operandSegments) { llvm::ArrayRef argTy; - Op op = builder.create(loc, argTy, operands); + Op op = Op::create(builder, loc, argTy, operands); op->setAttr(Op::getOperandSegmentSizeAttr(), builder.getDenseI32ArrayAttr(operandSegments)); return op; @@ -257,15 +257,15 @@ static void createDeclareAllocFuncWithArg(mlir::OpBuilder &modBuilder, if (unwrapFirBox) { mlir::Value desc = - builder.create(loc, registerFuncOp.getArgument(0)); - fir::BoxAddrOp boxAddrOp = builder.create(loc, desc); + fir::LoadOp::create(builder, loc, registerFuncOp.getArgument(0)); + fir::BoxAddrOp boxAddrOp = fir::BoxAddrOp::create(builder, loc, desc); addDeclareAttr(builder, boxAddrOp.getOperation(), clause); EntryOp entryOp = createDataEntryOp( builder, loc, boxAddrOp.getResult(), asFortran, bounds, /*structured=*/false, /*implicit=*/false, clause, boxAddrOp.getType(), /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); - builder.create( - loc, mlir::acc::DeclareTokenType::get(entryOp.getContext()), + mlir::acc::DeclareEnterOp::create( + builder, 
loc, mlir::acc::DeclareTokenType::get(entryOp.getContext()), mlir::ValueRange(entryOp.getAccVar())); } @@ -291,8 +291,8 @@ static void createDeclareDeallocFuncWithArg( mlir::Value var = preDeallocOp.getArgument(0); if (unwrapFirBox) { mlir::Value loadOp = - builder.create(loc, preDeallocOp.getArgument(0)); - fir::BoxAddrOp boxAddrOp = builder.create(loc, loadOp); + fir::LoadOp::create(builder, loc, preDeallocOp.getArgument(0)); + fir::BoxAddrOp boxAddrOp = fir::BoxAddrOp::create(builder, loc, loadOp); addDeclareAttr(builder, boxAddrOp.getOperation(), clause); var = boxAddrOp.getResult(); } @@ -303,25 +303,25 @@ static void createDeclareDeallocFuncWithArg( builder, loc, var, asFortran, bounds, /*structured=*/false, /*implicit=*/false, clause, var.getType(), /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); - builder.create( - loc, mlir::Value{}, mlir::ValueRange(entryOp.getAccVar())); + mlir::acc::DeclareExitOp::create(builder, loc, mlir::Value{}, + mlir::ValueRange(entryOp.getAccVar())); if constexpr (std::is_same_v || std::is_same_v) - builder.create(entryOp.getLoc(), entryOp.getAccVar(), - entryOp.getVar(), entryOp.getVarType(), - entryOp.getBounds(), entryOp.getAsyncOperands(), - entryOp.getAsyncOperandsDeviceTypeAttr(), - entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), - /*structured=*/false, /*implicit=*/false, - builder.getStringAttr(*entryOp.getName())); + ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), + entryOp.getVar(), entryOp.getVarType(), entryOp.getBounds(), + entryOp.getAsyncOperands(), + entryOp.getAsyncOperandsDeviceTypeAttr(), + entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), + /*structured=*/false, /*implicit=*/false, + builder.getStringAttr(*entryOp.getName())); else - builder.create(entryOp.getLoc(), entryOp.getAccVar(), - entryOp.getBounds(), entryOp.getAsyncOperands(), - entryOp.getAsyncOperandsDeviceTypeAttr(), - entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), - /*structured=*/false, /*implicit=*/false, - builder.getStringAttr(*entryOp.getName())); + ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), + entryOp.getBounds(), entryOp.getAsyncOperands(), + entryOp.getAsyncOperandsDeviceTypeAttr(), + entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), + /*structured=*/false, /*implicit=*/false, + builder.getStringAttr(*entryOp.getName())); // Generate the post dealloc function. 
modBuilder.setInsertionPointAfter(preDeallocOp); @@ -333,7 +333,7 @@ static void createDeclareDeallocFuncWithArg( var = postDeallocOp.getArgument(0); if (unwrapFirBox) { - var = builder.create(loc, postDeallocOp.getArgument(0)); + var = fir::LoadOp::create(builder, loc, postDeallocOp.getArgument(0)); asFortran << accFirDescriptorPostfix.str(); } @@ -385,8 +385,8 @@ genAtomicCaptureStatement(Fortran::lower::AbstractConverter &converter, // Generate `atomic.read` operation for atomic assigment statements fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - firOpBuilder.create( - loc, fromAddress, toAddress, mlir::TypeAttr::get(elementType)); + mlir::acc::AtomicReadOp::create(firOpBuilder, loc, fromAddress, toAddress, + mlir::TypeAttr::get(elementType)); } /// Used to generate atomic.write operation which is created in existing @@ -406,7 +406,7 @@ genAtomicWriteStatement(Fortran::lower::AbstractConverter &converter, rhsExpr = firOpBuilder.createConvert(loc, varType, rhsExpr); firOpBuilder.restoreInsertionPoint(insertionPoint); - firOpBuilder.create(loc, lhsAddr, rhsExpr); + mlir::acc::AtomicWriteOp::create(firOpBuilder, loc, lhsAddr, rhsExpr); } /// Used to generate atomic.update operation which is created in existing @@ -522,7 +522,7 @@ static inline void genAtomicUpdateStatement( mlir::Operation *atomicUpdateOp = nullptr; atomicUpdateOp = - firOpBuilder.create(currentLocation, lhsAddr); + mlir::acc::AtomicUpdateOp::create(firOpBuilder, currentLocation, lhsAddr); llvm::SmallVector varTys = {varType}; llvm::SmallVector locs = {currentLocation}; @@ -540,7 +540,7 @@ static inline void genAtomicUpdateStatement( *Fortran::semantics::GetExpr(assignmentStmtExpr), atomicStmtCtx)); mlir::Value convertResult = firOpBuilder.createConvert(currentLocation, varType, rhsExpr); - firOpBuilder.create(currentLocation, convertResult); + mlir::acc::YieldOp::create(firOpBuilder, currentLocation, convertResult); converter.resetExprOverrides(); } firOpBuilder.setInsertionPointAfter(atomicUpdateOp); @@ -647,7 +647,7 @@ void genAtomicCapture(Fortran::lower::AbstractConverter &converter, fir::getBase(converter.genExprValue(assign2.lhs, stmtCtx)).getType(); mlir::Operation *atomicCaptureOp = nullptr; - atomicCaptureOp = firOpBuilder.create(loc); + atomicCaptureOp = mlir::acc::AtomicCaptureOp::create(firOpBuilder, loc); firOpBuilder.createBlock(&(atomicCaptureOp->getRegion(0))); mlir::Block &block = atomicCaptureOp->getRegion(0).back(); @@ -688,7 +688,7 @@ void genAtomicCapture(Fortran::lower::AbstractConverter &converter, loc); } firOpBuilder.setInsertionPointToEnd(&block); - firOpBuilder.create(loc); + mlir::acc::TerminatorOp::create(firOpBuilder, loc); // The clean-ups associated with the statements inside the capture // construct must be generated after the AtomicCaptureOp. 
firOpBuilder.setInsertionPointAfter(atomicCaptureOp); @@ -708,6 +708,7 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, bool setDeclareAttr = false) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext}; + const bool unwrapBoxAddr = true; for (const auto &accObject : objectList.v) { llvm::SmallVector bounds; std::stringstream asFortran; @@ -735,8 +736,25 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, Op op = createDataEntryOp( builder, operandLocation, baseAddr, asFortran, bounds, structured, implicit, dataClause, baseAddr.getType(), async, asyncDeviceTypes, - asyncOnlyDeviceTypes, /*unwrapBoxAddr=*/true, info.isPresent); + asyncOnlyDeviceTypes, unwrapBoxAddr, info.isPresent); dataOperands.push_back(op.getAccVar()); + + // For UseDeviceOp, if operand is one of a pair resulting from a + // declare operation, create a UseDeviceOp for the other operand as well. + if constexpr (std::is_same_v) { + if (auto declareOp = + mlir::dyn_cast(baseAddr.getDefiningOp())) { + mlir::Value otherAddr = declareOp.getResult(1); + if (baseAddr != otherAddr) { + Op op = createDataEntryOp(builder, operandLocation, otherAddr, + asFortran, bounds, structured, implicit, + dataClause, otherAddr.getType(), async, + asyncDeviceTypes, asyncOnlyDeviceTypes, + unwrapBoxAddr, info.isPresent); + dataOperands.push_back(op.getAccVar()); + } + } + } } } @@ -821,15 +839,15 @@ genDataExitOperations(fir::FirOpBuilder &builder, mlir::Location opLoc = exitLoc ? *exitLoc : entryOp.getLoc(); if constexpr (std::is_same_v || std::is_same_v) - builder.create( - opLoc, entryOp.getAccVar(), entryOp.getVar(), entryOp.getVarType(), - entryOp.getBounds(), entryOp.getAsyncOperands(), + ExitOp::create( + builder, opLoc, entryOp.getAccVar(), entryOp.getVar(), + entryOp.getVarType(), entryOp.getBounds(), entryOp.getAsyncOperands(), entryOp.getAsyncOperandsDeviceTypeAttr(), entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), structured, entryOp.getImplicit(), builder.getStringAttr(*entryOp.getName())); else - builder.create( - opLoc, entryOp.getAccVar(), entryOp.getBounds(), + ExitOp::create( + builder, opLoc, entryOp.getAccVar(), entryOp.getBounds(), entryOp.getAsyncOperands(), entryOp.getAsyncOperandsDeviceTypeAttr(), entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), structured, entryOp.getImplicit(), builder.getStringAttr(*entryOp.getName())); @@ -841,9 +859,9 @@ fir::ShapeOp genShapeOp(mlir::OpBuilder &builder, fir::SequenceType seqTy, llvm::SmallVector extents; mlir::Type idxTy = builder.getIndexType(); for (auto extent : seqTy.getShape()) - extents.push_back(builder.create( - loc, idxTy, builder.getIntegerAttr(idxTy, extent))); - return builder.create(loc, extents); + extents.push_back(mlir::arith::ConstantOp::create( + builder, loc, idxTy, builder.getIntegerAttr(idxTy, extent))); + return fir::ShapeOp::create(builder, loc, extents); } /// Get the initial value for reduction operator. 
@@ -918,8 +936,8 @@ static mlir::Value getReductionInitValue(fir::FirOpBuilder &builder, return builder.createBool(loc, value); } if (ty.isIntOrIndex()) - return builder.create( - loc, ty, + return mlir::arith::ConstantOp::create( + builder, loc, ty, builder.getIntegerAttr(ty, getReductionInitValue(op, ty))); if (op == mlir::acc::ReductionOperator::AccMin || op == mlir::acc::ReductionOperator::AccMax) { @@ -927,13 +945,13 @@ static mlir::Value getReductionInitValue(fir::FirOpBuilder &builder, llvm::report_fatal_error( "min/max reduction not supported for complex type"); if (auto floatTy = mlir::dyn_cast_or_null(ty)) - return builder.create( - loc, ty, + return mlir::arith::ConstantOp::create( + builder, loc, ty, builder.getFloatAttr(ty, getReductionInitValue(op, ty))); } else if (auto floatTy = mlir::dyn_cast_or_null(ty)) { - return builder.create( - loc, ty, + return mlir::arith::ConstantOp::create( + builder, loc, ty, builder.getFloatAttr(ty, getReductionInitValue(op, ty))); } else if (auto cmplxTy = mlir::dyn_cast_or_null(ty)) { mlir::Type floatTy = cmplxTy.getElementType(); @@ -967,10 +985,10 @@ static RecipeOp genRecipeOp( mlir::OpBuilder modBuilder(mod.getBodyRegion()); RecipeOp recipe; if constexpr (std::is_same_v) { - recipe = modBuilder.create(loc, recipeName, - ty, op); + recipe = mlir::acc::ReductionRecipeOp::create(modBuilder, loc, recipeName, + ty, op); } else { - recipe = modBuilder.create(loc, recipeName, ty); + recipe = RecipeOp::create(modBuilder, loc, recipeName, ty); } llvm::SmallVector argsTy{ty}; @@ -1014,8 +1032,8 @@ static RecipeOp genRecipeOp( initName, initBlock->getArguments().take_back(initBlock->getArguments().size() - 1), initValue); - builder.create(loc, retVal ? retVal - : initBlock->getArgument(0)); + mlir::acc::YieldOp::create(builder, loc, + retVal ? 
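The reduction recipe generation above needs a neutral initial value per operator: combining the initializer with any element must return that element, so for min/max the recipe starts from the largest/lowest representable value. A minimal sketch of that rule in plain C++ (the enum and helper are illustrative only, not flang's actual `getReductionInitValue`):

```cpp
#include <cassert>
#include <limits>

// Illustrative subset of the OpenACC reduction operators handled above.
enum class ReductionOp { Add, Mul, Min, Max };

// Neutral element for a reduction over doubles: combining it with any value x
// yields x, so the first real element of the reduction always survives.
double reductionInitValue(ReductionOp op) {
  switch (op) {
  case ReductionOp::Add: return 0.0;
  case ReductionOp::Mul: return 1.0;
  case ReductionOp::Min: return std::numeric_limits<double>::max();
  case ReductionOp::Max: return std::numeric_limits<double>::lowest();
  }
  return 0.0;
}

int main() {
  assert(reductionInitValue(ReductionOp::Add) == 0.0);
  assert(reductionInitValue(ReductionOp::Min) > 1.0e300);
}
```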
retVal : initBlock->getArgument(0)); return recipe; } @@ -1114,15 +1132,17 @@ static mlir::Value genShapeFromBoundsOrArgs( mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); for (unsigned i = 0; i < args.size(); i += 3) { mlir::Value s1 = - builder.create(loc, args[i + 1], args[0]); - mlir::Value s2 = builder.create(loc, s1, one); - mlir::Value s3 = builder.create(loc, s2, args[i + 2]); - mlir::Value cmp = builder.create( - loc, mlir::arith::CmpIPredicate::sgt, s3, zero); - mlir::Value ext = builder.create(loc, cmp, s3, zero); + mlir::arith::SubIOp::create(builder, loc, args[i + 1], args[0]); + mlir::Value s2 = mlir::arith::AddIOp::create(builder, loc, s1, one); + mlir::Value s3 = + mlir::arith::DivSIOp::create(builder, loc, s2, args[i + 2]); + mlir::Value cmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sgt, s3, zero); + mlir::Value ext = + mlir::arith::SelectOp::create(builder, loc, cmp, s3, zero); extents.push_back(ext); } - return builder.create(loc, extents); + return fir::ShapeOp::create(builder, loc, extents); } static hlfir::DesignateOp::Subscripts @@ -1139,8 +1159,8 @@ static hlfir::Entity genDesignateWithTriplets( hlfir::DesignateOp::Subscripts &triplets, mlir::Value shape) { llvm::SmallVector lenParams; hlfir::genLengthParameters(loc, builder, entity, lenParams); - auto designate = builder.create( - loc, entity.getBase().getType(), entity, /*component=*/"", + auto designate = hlfir::DesignateOp::create( + builder, loc, entity.getBase().getType(), entity, /*component=*/"", /*componentShape=*/mlir::Value{}, triplets, /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, shape, lenParams); @@ -1180,22 +1200,22 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe( builder.setInsertionPointToEnd(&recipe.getCopyRegion().back()); ty = fir::unwrapRefType(ty); if (fir::isa_trivial(ty)) { - mlir::Value initValue = builder.create( - loc, recipe.getCopyRegion().front().getArgument(0)); - builder.create(loc, initValue, - recipe.getCopyRegion().front().getArgument(1)); + mlir::Value initValue = fir::LoadOp::create( + builder, loc, recipe.getCopyRegion().front().getArgument(0)); + fir::StoreOp::create(builder, loc, initValue, + recipe.getCopyRegion().front().getArgument(1)); } else if (auto seqTy = mlir::dyn_cast_or_null(ty)) { fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; auto shape = genShapeFromBoundsOrArgs( loc, firBuilder, seqTy, bounds, recipe.getCopyRegion().getArguments()); - auto leftDeclOp = builder.create( - loc, recipe.getCopyRegion().getArgument(0), llvm::StringRef{}, shape, - llvm::ArrayRef{}, /*dummy_scope=*/nullptr, + auto leftDeclOp = hlfir::DeclareOp::create( + builder, loc, recipe.getCopyRegion().getArgument(0), llvm::StringRef{}, + shape, llvm::ArrayRef{}, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); - auto rightDeclOp = builder.create( - loc, recipe.getCopyRegion().getArgument(1), llvm::StringRef{}, shape, - llvm::ArrayRef{}, /*dummy_scope=*/nullptr, + auto rightDeclOp = hlfir::DeclareOp::create( + builder, loc, recipe.getCopyRegion().getArgument(1), llvm::StringRef{}, + shape, llvm::ArrayRef{}, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); hlfir::DesignateOp::Subscripts triplets = @@ -1207,7 +1227,7 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe( auto right = genDesignateWithTriplets(firBuilder, loc, rightEntity, triplets, shape); - firBuilder.create(loc, left, right); + hlfir::AssignOp::create(firBuilder, loc, 
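`genShapeFromBoundsOrArgs` above turns each (lowerbound, upperbound, step) block-argument triplet into an extent using the op sequence subi, addi 1, divsi, cmpi sgt, select. Written out as plain integer arithmetic (an illustration of that emitted sequence, not flang code; exactly which block arguments feed lb and ub is elided here):

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the arith ops above: raw = (ub - lb + 1) / step, clamped at zero so
// empty sections contribute a zero extent to the shape.
std::int64_t sectionExtent(std::int64_t lb, std::int64_t ub, std::int64_t step) {
  std::int64_t raw = (ub - lb + 1) / step;
  return raw > 0 ? raw : 0;
}

int main() {
  assert(sectionExtent(2, 10, 1) == 9);  // a(2:10) has nine elements
  assert(sectionExtent(10, 2, 1) == 0);  // a(10:2) is an empty section
}
```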
left, right); } else if (auto boxTy = mlir::dyn_cast_or_null(ty)) { fir::FirOpBuilder firBuilder{builder, recipe.getOperation()}; @@ -1228,10 +1248,10 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe( auto rightEntity = hlfir::Entity{recipe.getCopyRegion().getArgument(1)}; auto right = genDesignateWithTriplets(firBuilder, loc, rightEntity, triplets, shape); - firBuilder.create(loc, left, right); + hlfir::AssignOp::create(firBuilder, loc, left, right); } - builder.create(loc); + mlir::acc::TerminatorOp::create(builder, loc); builder.restoreInsertionPoint(ip); return recipe; } @@ -1396,10 +1416,10 @@ static mlir::Value genLogicalCombiner(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value value1, mlir::Value value2) { mlir::Type i1 = builder.getI1Type(); - mlir::Value v1 = builder.create(loc, i1, value1); - mlir::Value v2 = builder.create(loc, i1, value2); - mlir::Value combined = builder.create(loc, v1, v2); - return builder.create(loc, value1.getType(), combined); + mlir::Value v1 = fir::ConvertOp::create(builder, loc, i1, value1); + mlir::Value v2 = fir::ConvertOp::create(builder, loc, i1, value2); + mlir::Value combined = Op::create(builder, loc, v1, v2); + return fir::ConvertOp::create(builder, loc, value1.getType(), combined); } static mlir::Value genComparisonCombiner(fir::FirOpBuilder &builder, @@ -1408,10 +1428,10 @@ static mlir::Value genComparisonCombiner(fir::FirOpBuilder &builder, mlir::Value value1, mlir::Value value2) { mlir::Type i1 = builder.getI1Type(); - mlir::Value v1 = builder.create(loc, i1, value1); - mlir::Value v2 = builder.create(loc, i1, value2); - mlir::Value add = builder.create(loc, pred, v1, v2); - return builder.create(loc, value1.getType(), add); + mlir::Value v1 = fir::ConvertOp::create(builder, loc, i1, value1); + mlir::Value v2 = fir::ConvertOp::create(builder, loc, i1, value2); + mlir::Value add = mlir::arith::CmpIOp::create(builder, loc, pred, v1, v2); + return fir::ConvertOp::create(builder, loc, value1.getType(), add); } static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder, @@ -1423,21 +1443,21 @@ static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder, value2 = builder.loadIfRef(loc, value2); if (op == mlir::acc::ReductionOperator::AccAdd) { if (ty.isIntOrIndex()) - return builder.create(loc, value1, value2); + return mlir::arith::AddIOp::create(builder, loc, value1, value2); if (mlir::isa(ty)) - return builder.create(loc, value1, value2); + return mlir::arith::AddFOp::create(builder, loc, value1, value2); if (auto cmplxTy = mlir::dyn_cast_or_null(ty)) - return builder.create(loc, value1, value2); + return fir::AddcOp::create(builder, loc, value1, value2); TODO(loc, "reduction add type"); } if (op == mlir::acc::ReductionOperator::AccMul) { if (ty.isIntOrIndex()) - return builder.create(loc, value1, value2); + return mlir::arith::MulIOp::create(builder, loc, value1, value2); if (mlir::isa(ty)) - return builder.create(loc, value1, value2); + return mlir::arith::MulFOp::create(builder, loc, value1, value2); if (mlir::isa(ty)) - return builder.create(loc, value1, value2); + return fir::MulcOp::create(builder, loc, value1, value2); TODO(loc, "reduction mul type"); } @@ -1448,13 +1468,13 @@ static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder, return fir::genMax(builder, loc, {value1, value2}); if (op == mlir::acc::ReductionOperator::AccIand) - return builder.create(loc, value1, value2); + return mlir::arith::AndIOp::create(builder, loc, value1, value2); if (op == 
mlir::acc::ReductionOperator::AccIor) - return builder.create(loc, value1, value2); + return mlir::arith::OrIOp::create(builder, loc, value1, value2); if (op == mlir::acc::ReductionOperator::AccXor) - return builder.create(loc, value1, value2); + return mlir::arith::XOrIOp::create(builder, loc, value1, value2); if (op == mlir::acc::ReductionOperator::AccLand) return genLogicalCombiner(builder, loc, value1, @@ -1502,19 +1522,21 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, auto shape = genShapeFromBoundsOrArgs(loc, builder, seqTy, bounds, recipe.getCombinerRegion().getArguments()); - auto v1DeclareOp = builder.create( - loc, value1, llvm::StringRef{}, shape, llvm::ArrayRef{}, + auto v1DeclareOp = hlfir::DeclareOp::create( + builder, loc, value1, llvm::StringRef{}, shape, + llvm::ArrayRef{}, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); - auto v2DeclareOp = builder.create( - loc, value2, llvm::StringRef{}, shape, llvm::ArrayRef{}, + auto v2DeclareOp = hlfir::DeclareOp::create( + builder, loc, value2, llvm::StringRef{}, shape, + llvm::ArrayRef{}, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); hlfir::DesignateOp::Subscripts triplets = getTripletsFromArgs(recipe); llvm::SmallVector lenParamsLeft; auto leftEntity = hlfir::Entity{v1DeclareOp.getBase()}; hlfir::genLengthParameters(loc, builder, leftEntity, lenParamsLeft); - auto leftDesignate = builder.create( - loc, v1DeclareOp.getBase().getType(), v1DeclareOp.getBase(), + auto leftDesignate = hlfir::DesignateOp::create( + builder, loc, v1DeclareOp.getBase().getType(), v1DeclareOp.getBase(), /*component=*/"", /*componentShape=*/mlir::Value{}, triplets, /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, @@ -1524,8 +1546,8 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, llvm::SmallVector lenParamsRight; auto rightEntity = hlfir::Entity{v2DeclareOp.getBase()}; hlfir::genLengthParameters(loc, builder, rightEntity, lenParamsLeft); - auto rightDesignate = builder.create( - loc, v2DeclareOp.getBase().getType(), v2DeclareOp.getBase(), + auto rightDesignate = hlfir::DesignateOp::create( + builder, loc, v2DeclareOp.getBase().getType(), v2DeclareOp.getBase(), /*component=*/"", /*componentShape=*/mlir::Value{}, triplets, /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, @@ -1546,21 +1568,21 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value elemental = hlfir::genElementalOp( loc, builder, seqTy.getEleTy(), shape, typeParams, genKernel, /*isUnordered=*/true); - builder.create(loc, elemental, v1DeclareOp.getBase()); + hlfir::AssignOp::create(builder, loc, elemental, v1DeclareOp.getBase()); return; } if (bounds.empty()) { llvm::SmallVector extents; mlir::Type idxTy = builder.getIndexType(); for (auto extent : seqTy.getShape()) { - mlir::Value lb = builder.create( - loc, idxTy, builder.getIntegerAttr(idxTy, 0)); - mlir::Value ub = builder.create( - loc, idxTy, builder.getIntegerAttr(idxTy, extent - 1)); - mlir::Value step = builder.create( - loc, idxTy, builder.getIntegerAttr(idxTy, 1)); - auto loop = builder.create(loc, lb, ub, step, - /*unordered=*/false); + mlir::Value lb = mlir::arith::ConstantOp::create( + builder, loc, idxTy, builder.getIntegerAttr(idxTy, 0)); + mlir::Value ub = mlir::arith::ConstantOp::create( + builder, loc, idxTy, builder.getIntegerAttr(idxTy, extent - 1)); + mlir::Value step = mlir::arith::ConstantOp::create( + builder, loc, idxTy, builder.getIntegerAttr(idxTy, 1)); + auto loop = 
fir::DoLoopOp::create(builder, loc, lb, ub, step, + /*unordered=*/false); builder.setInsertionPointToStart(loop.getBody()); loops.push_back(loop); ivs.push_back(loop.getInductionVar()); @@ -1576,8 +1598,8 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, llvm::SmallVector values = genConstantBounds(builder, loc, dataBound); auto loop = - builder.create(loc, values[0], values[1], values[2], - /*unordered=*/false); + fir::DoLoopOp::create(builder, loc, values[0], values[1], values[2], + /*unordered=*/false); builder.setInsertionPointToStart(loop.getBody()); loops.push_back(loop); ivs.push_back(loop.getInductionVar()); @@ -1593,31 +1615,31 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value lb = recipe.getCombinerRegion().getArgument(i); mlir::Value ub = recipe.getCombinerRegion().getArgument(i + 1); mlir::Value step = recipe.getCombinerRegion().getArgument(i + 2); - auto loop = builder.create(loc, lb, ub, step, - /*unordered=*/false); + auto loop = fir::DoLoopOp::create(builder, loc, lb, ub, step, + /*unordered=*/false); builder.setInsertionPointToStart(loop.getBody()); loops.push_back(loop); ivs.push_back(loop.getInductionVar()); } } - auto addr1 = builder.create(loc, refTy, value1, ivs); - auto addr2 = builder.create(loc, refTy, value2, ivs); - auto load1 = builder.create(loc, addr1); - auto load2 = builder.create(loc, addr2); + auto addr1 = fir::CoordinateOp::create(builder, loc, refTy, value1, ivs); + auto addr2 = fir::CoordinateOp::create(builder, loc, refTy, value2, ivs); + auto load1 = fir::LoadOp::create(builder, loc, addr1); + auto load2 = fir::LoadOp::create(builder, loc, addr2); mlir::Value res = genScalarCombiner(builder, loc, op, seqTy.getEleTy(), load1, load2); - builder.create(loc, res, addr1); + fir::StoreOp::create(builder, loc, res, addr1); builder.setInsertionPointAfter(loops[0]); } else if (auto boxTy = mlir::dyn_cast(ty)) { mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy()); if (fir::isa_trivial(innerTy)) { mlir::Value boxAddr1 = value1, boxAddr2 = value2; if (fir::isBoxAddress(boxAddr1.getType())) - boxAddr1 = builder.create(loc, boxAddr1); + boxAddr1 = fir::LoadOp::create(builder, loc, boxAddr1); if (fir::isBoxAddress(boxAddr2.getType())) - boxAddr2 = builder.create(loc, boxAddr2); - boxAddr1 = builder.create(loc, boxAddr1); - boxAddr2 = builder.create(loc, boxAddr2); + boxAddr2 = fir::LoadOp::create(builder, loc, boxAddr2); + boxAddr1 = fir::BoxAddrOp::create(builder, loc, boxAddr1); + boxAddr2 = fir::BoxAddrOp::create(builder, loc, boxAddr2); auto leftEntity = hlfir::Entity{boxAddr1}; auto rightEntity = hlfir::Entity{boxAddr2}; @@ -1625,7 +1647,7 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, auto rightVal = hlfir::loadTrivialScalar(loc, builder, rightEntity); mlir::Value res = genScalarCombiner(builder, loc, op, innerTy, leftVal, rightVal); - builder.create(loc, res, boxAddr1); + hlfir::AssignOp::create(builder, loc, res, boxAddr1); } else { mlir::Type innerTy = fir::extractSequenceType(boxTy); fir::SequenceType seqTy = @@ -1640,14 +1662,14 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, getSubscriptsFromArgs(recipe.getCombinerRegion().getArguments()); auto leftEntity = hlfir::Entity{value1}; if (fir::isBoxAddress(value1.getType())) - leftEntity = - hlfir::Entity{builder.create(loc, value1).getResult()}; + leftEntity = hlfir::Entity{ + fir::LoadOp::create(builder, loc, value1).getResult()}; auto left = genDesignateWithTriplets(builder, loc, 
leftEntity, triplets, shape); auto rightEntity = hlfir::Entity{value2}; if (fir::isBoxAddress(value2.getType())) - rightEntity = - hlfir::Entity{builder.create(loc, value2).getResult()}; + rightEntity = hlfir::Entity{ + fir::LoadOp::create(builder, loc, value2).getResult()}; auto right = genDesignateWithTriplets(builder, loc, rightEntity, triplets, shape); @@ -1665,11 +1687,11 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value elemental = hlfir::genElementalOp( loc, builder, seqTy.getEleTy(), shape, typeParams, genKernel, /*isUnordered=*/true); - builder.create(loc, elemental, value1); + hlfir::AssignOp::create(builder, loc, elemental, value1); } } else { mlir::Value res = genScalarCombiner(builder, loc, op, ty, value1, value2); - builder.create(loc, res, value1); + fir::StoreOp::create(builder, loc, res, value1); } } @@ -1711,7 +1733,7 @@ mlir::acc::ReductionRecipeOp Fortran::lower::createOrGetReductionRecipe( mlir::Value v1 = recipe.getCombinerRegion().front().getArgument(0); mlir::Value v2 = recipe.getCombinerRegion().front().getArgument(1); genCombiner(builder, loc, op, ty, v1, v2, recipe, bounds, allConstantBound); - builder.create(loc, v1); + mlir::acc::YieldOp::create(builder, loc, v1); builder.restoreInsertionPoint(ip); return recipe; } @@ -1803,7 +1825,7 @@ createRegionOp(fir::FirOpBuilder &builder, mlir::Location loc, llvm::SmallVector retTy = {}, mlir::Value yieldValue = {}, mlir::TypeRange argsTy = {}, llvm::SmallVector locs = {}) { - Op op = builder.create(loc, retTy, operands); + Op op = Op::create(builder, loc, retTy, operands); builder.createBlock(&op.getRegion(), op.getRegion().end(), argsTy, locs); mlir::Block &block = op.getRegion().back(); builder.setInsertionPointToStart(&block); @@ -1823,13 +1845,13 @@ createRegionOp(fir::FirOpBuilder &builder, mlir::Location loc, if (yieldValue) { if constexpr (std::is_same_v) { - Terminator yieldOp = builder.create(returnLoc, yieldValue); + Terminator yieldOp = Terminator::create(builder, returnLoc, yieldValue); yieldValue.getDefiningOp()->moveBefore(yieldOp); } else { - builder.create(returnLoc); + Terminator::create(builder, returnLoc); } } else { - builder.create(returnLoc); + Terminator::create(builder, returnLoc); } builder.setInsertionPointToStart(&block); return op; @@ -2419,7 +2441,7 @@ static mlir::acc::LoopOp createLoopOp( for (auto [arg, value] : llvm::zip( loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate)) - builder.create(currentLocation, arg, value); + fir::StoreOp::create(builder, currentLocation, arg, value); loopOp.setInclusiveUpperbound(inclusiveBounds); @@ -3732,8 +3754,8 @@ genACCUpdateOp(Fortran::lower::AbstractConverter &converter, dataClauseOperands.append(updateHostOperands); - builder.create( - currentLocation, ifCond, asyncOperands, + mlir::acc::UpdateOp::create( + builder, currentLocation, ifCond, asyncOperands, getArrayAttr(builder, asyncOperandsDeviceTypes), getArrayAttr(builder, asyncOnlyDeviceTypes), waitOperands, getDenseI32ArrayAttr(builder, waitOperandsSegments), @@ -3855,13 +3877,14 @@ static void createDeclareGlobalOp(mlir::OpBuilder &modBuilder, const std::string &declareGlobalName, bool implicit, std::stringstream &asFortran) { GlobalOp declareGlobalOp = - modBuilder.create(loc, declareGlobalName); + GlobalOp::create(modBuilder, loc, declareGlobalName); builder.createBlock(&declareGlobalOp.getRegion(), declareGlobalOp.getRegion().end(), {}, {}); builder.setInsertionPointToEnd(&declareGlobalOp.getRegion().back()); - fir::AddrOfOp addrOp = 
builder.create( - loc, fir::ReferenceType::get(globalOp.getType()), globalOp.getSymbol()); + fir::AddrOfOp addrOp = fir::AddrOfOp::create( + builder, loc, fir::ReferenceType::get(globalOp.getType()), + globalOp.getSymbol()); addDeclareAttr(builder, addrOp, clause); llvm::SmallVector bounds; @@ -3870,21 +3893,21 @@ static void createDeclareGlobalOp(mlir::OpBuilder &modBuilder, /*structured=*/false, implicit, clause, addrOp.getResTy().getType(), /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); if constexpr (std::is_same_v) - builder.create( - loc, mlir::acc::DeclareTokenType::get(entryOp.getContext()), - mlir::ValueRange(entryOp.getAccVar())); + DeclareOp::create(builder, loc, + mlir::acc::DeclareTokenType::get(entryOp.getContext()), + mlir::ValueRange(entryOp.getAccVar())); else - builder.create(loc, mlir::Value{}, - mlir::ValueRange(entryOp.getAccVar())); + DeclareOp::create(builder, loc, mlir::Value{}, + mlir::ValueRange(entryOp.getAccVar())); if constexpr (std::is_same_v) { - builder.create(entryOp.getLoc(), entryOp.getAccVar(), - entryOp.getBounds(), entryOp.getAsyncOperands(), - entryOp.getAsyncOperandsDeviceTypeAttr(), - entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), - /*structured=*/false, /*implicit=*/false, - builder.getStringAttr(*entryOp.getName())); + ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), + entryOp.getBounds(), entryOp.getAsyncOperands(), + entryOp.getAsyncOperandsDeviceTypeAttr(), + entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), + /*structured=*/false, /*implicit=*/false, + builder.getStringAttr(*entryOp.getName())); } - builder.create(loc); + mlir::acc::TerminatorOp::create(builder, loc); modBuilder.setInsertionPointAfter(declareGlobalOp); } @@ -3899,8 +3922,9 @@ static void createDeclareAllocFunc(mlir::OpBuilder &modBuilder, auto registerFuncOp = createDeclareFunc(modBuilder, builder, loc, registerFuncName.str()); - fir::AddrOfOp addrOp = builder.create( - loc, fir::ReferenceType::get(globalOp.getType()), globalOp.getSymbol()); + fir::AddrOfOp addrOp = fir::AddrOfOp::create( + builder, loc, fir::ReferenceType::get(globalOp.getType()), + globalOp.getSymbol()); std::stringstream asFortran; asFortran << Fortran::lower::mangle::demangleName(globalOp.getSymName()); @@ -3923,15 +3947,15 @@ static void createDeclareAllocFunc(mlir::OpBuilder &modBuilder, createSimpleOp(builder, loc, operands, operandSegments); if (unwrapFirBox) { - auto loadOp = builder.create(loc, addrOp.getResult()); - fir::BoxAddrOp boxAddrOp = builder.create(loc, loadOp); + auto loadOp = fir::LoadOp::create(builder, loc, addrOp.getResult()); + fir::BoxAddrOp boxAddrOp = fir::BoxAddrOp::create(builder, loc, loadOp); addDeclareAttr(builder, boxAddrOp.getOperation(), clause); EntryOp entryOp = createDataEntryOp( builder, loc, boxAddrOp.getResult(), asFortran, bounds, /*structured=*/false, /*implicit=*/false, clause, boxAddrOp.getType(), /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); - builder.create( - loc, mlir::acc::DeclareTokenType::get(entryOp.getContext()), + mlir::acc::DeclareEnterOp::create( + builder, loc, mlir::acc::DeclareTokenType::get(entryOp.getContext()), mlir::ValueRange(entryOp.getAccVar())); } @@ -3962,10 +3986,11 @@ static void createDeclareDeallocFunc(mlir::OpBuilder &modBuilder, auto preDeallocOp = createDeclareFunc(modBuilder, builder, loc, preDeallocFuncName.str()); - fir::AddrOfOp addrOp = builder.create( - loc, fir::ReferenceType::get(globalOp.getType()), globalOp.getSymbol()); - auto loadOp = builder.create(loc, 
addrOp.getResult()); - fir::BoxAddrOp boxAddrOp = builder.create(loc, loadOp); + fir::AddrOfOp addrOp = fir::AddrOfOp::create( + builder, loc, fir::ReferenceType::get(globalOp.getType()), + globalOp.getSymbol()); + auto loadOp = fir::LoadOp::create(builder, loc, addrOp.getResult()); + fir::BoxAddrOp boxAddrOp = fir::BoxAddrOp::create(builder, loc, loadOp); mlir::Value var = boxAddrOp.getResult(); addDeclareAttr(builder, var.getDefiningOp(), clause); @@ -3976,25 +4001,25 @@ static void createDeclareDeallocFunc(mlir::OpBuilder &modBuilder, /*structured=*/false, /*implicit=*/false, clause, var.getType(), /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{}); - builder.create( - loc, mlir::Value{}, mlir::ValueRange(entryOp.getAccVar())); + mlir::acc::DeclareExitOp::create(builder, loc, mlir::Value{}, + mlir::ValueRange(entryOp.getAccVar())); if constexpr (std::is_same_v || std::is_same_v) - builder.create( - entryOp.getLoc(), entryOp.getAccVar(), entryOp.getVar(), - entryOp.getBounds(), entryOp.getAsyncOperands(), - entryOp.getAsyncOperandsDeviceTypeAttr(), entryOp.getAsyncOnlyAttr(), - entryOp.getDataClause(), - /*structured=*/false, /*implicit=*/false, - builder.getStringAttr(*entryOp.getName())); + ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), + entryOp.getVar(), entryOp.getBounds(), + entryOp.getAsyncOperands(), + entryOp.getAsyncOperandsDeviceTypeAttr(), + entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), + /*structured=*/false, /*implicit=*/false, + builder.getStringAttr(*entryOp.getName())); else - builder.create( - entryOp.getLoc(), entryOp.getAccVar(), entryOp.getBounds(), - entryOp.getAsyncOperands(), entryOp.getAsyncOperandsDeviceTypeAttr(), - entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), - /*structured=*/false, /*implicit=*/false, - builder.getStringAttr(*entryOp.getName())); + ExitOp::create(builder, entryOp.getLoc(), entryOp.getAccVar(), + entryOp.getBounds(), entryOp.getAsyncOperands(), + entryOp.getAsyncOperandsDeviceTypeAttr(), + entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), + /*structured=*/false, /*implicit=*/false, + builder.getStringAttr(*entryOp.getName())); // Generate the post dealloc function. 
modBuilder.setInsertionPointAfter(preDeallocOp); @@ -4006,8 +4031,9 @@ static void createDeclareDeallocFunc(mlir::OpBuilder &modBuilder, auto postDeallocOp = createDeclareFunc(modBuilder, builder, loc, postDeallocFuncName.str()); - fir::AddrOfOp addrOp = builder.create( - loc, fir::ReferenceType::get(globalOp.getType()), globalOp.getSymbol()); + fir::AddrOfOp addrOp = fir::AddrOfOp::create( + builder, loc, fir::ReferenceType::get(globalOp.getType()), + globalOp.getSymbol()); if (unwrapFirBox) asFortran << accFirDescriptorPostfix.str(); llvm::SmallVector bounds; @@ -4244,13 +4270,13 @@ genDeclareInFunction(Fortran::lower::AbstractConverter &converter, auto ops = funcOp.getOps(); mlir::Value declareToken; if (ops.empty()) { - declareToken = builder.create( - loc, mlir::acc::DeclareTokenType::get(builder.getContext()), + declareToken = mlir::acc::DeclareEnterOp::create( + builder, loc, mlir::acc::DeclareTokenType::get(builder.getContext()), dataClauseOperands); } else { auto declareOp = *ops.begin(); - auto newDeclareOp = builder.create( - loc, mlir::acc::DeclareTokenType::get(builder.getContext()), + auto newDeclareOp = mlir::acc::DeclareEnterOp::create( + builder, loc, mlir::acc::DeclareTokenType::get(builder.getContext()), declareOp.getDataClauseOperands()); newDeclareOp.getDataClauseOperandsMutable().append(dataClauseOperands); declareToken = newDeclareOp.getToken(); @@ -4272,7 +4298,7 @@ genDeclareInFunction(Fortran::lower::AbstractConverter &converter, mlir::func::FuncOp funcOp = builder.getFunction(); auto ops = funcOp.getOps(); if (ops.empty()) { - builder.create(loc, declareToken, operands); + mlir::acc::DeclareExitOp::create(builder, loc, declareToken, operands); } else { auto declareOp = *ops.begin(); declareOp.getDataClauseOperandsMutable().append(operands); @@ -4396,10 +4422,34 @@ getAttributeValueByDeviceType(llvm::SmallVector &attributes, return std::nullopt; } +// Helper function to extract string value from bind name variant +static std::optional getBindNameStringValue( + const std::optional> + &bindNameValue) { + if (!bindNameValue.has_value()) + return std::nullopt; + + return std::visit( + [](const auto &attr) -> std::optional { + if constexpr (std::is_same_v, + mlir::StringAttr>) { + return attr.getValue(); + } else if constexpr (std::is_same_v, + mlir::SymbolRefAttr>) { + return attr.getLeafReference(); + } else { + return std::nullopt; + } + }, + bindNameValue.value()); +} + static bool compareDeviceTypeInfo( mlir::acc::RoutineOp op, - llvm::SmallVector &bindNameArrayAttr, - llvm::SmallVector &bindNameDeviceTypeArrayAttr, + llvm::SmallVector &bindIdNameArrayAttr, + llvm::SmallVector &bindStrNameArrayAttr, + llvm::SmallVector &bindIdNameDeviceTypeArrayAttr, + llvm::SmallVector &bindStrNameDeviceTypeArrayAttr, llvm::SmallVector &gangArrayAttr, llvm::SmallVector &gangDimArrayAttr, llvm::SmallVector &gangDimDeviceTypeArrayAttr, @@ -4409,9 +4459,13 @@ static bool compareDeviceTypeInfo( for (uint32_t dtypeInt = 0; dtypeInt != mlir::acc::getMaxEnumValForDeviceType(); ++dtypeInt) { auto dtype = static_cast(dtypeInt); - if (op.getBindNameValue(dtype) != - getAttributeValueByDeviceType( - bindNameArrayAttr, bindNameDeviceTypeArrayAttr, dtype)) + auto bindNameValue = getBindNameStringValue(op.getBindNameValue(dtype)); + if (bindNameValue != + getAttributeValueByDeviceType( + bindIdNameArrayAttr, bindIdNameDeviceTypeArrayAttr, dtype) && + bindNameValue != + getAttributeValueByDeviceType( + bindStrNameArrayAttr, bindStrNameDeviceTypeArrayAttr, dtype)) return false; if 
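The `getBindNameStringValue` helper added above dispatches on a `std::variant` of two attribute kinds with `std::visit` and `if constexpr`. The same pattern on plain standard-library types, as a self-contained sketch (the `StrAttr`/`SymAttr` structs are invented stand-ins for the string and symbol-reference attributes):

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <type_traits>
#include <variant>

// Stand-ins for the two attribute kinds carried by the variant above.
struct StrAttr { std::string value; };
struct SymAttr { std::string leafReference; };

using BindName = std::variant<StrAttr, SymAttr>;

// Same shape as getBindNameStringValue: unwrap the optional, then visit the
// variant and pick the right accessor per alternative with `if constexpr`.
std::optional<std::string>
bindNameString(const std::optional<BindName> &bindName) {
  if (!bindName.has_value())
    return std::nullopt;
  return std::visit(
      [](const auto &attr) -> std::optional<std::string> {
        using T = std::decay_t<decltype(attr)>;
        if constexpr (std::is_same_v<T, StrAttr>)
          return attr.value;
        else if constexpr (std::is_same_v<T, SymAttr>)
          return attr.leafReference;
        else
          return std::nullopt;
      },
      bindName.value());
}

int main() {
  std::optional<BindName> byString = StrAttr{"vector_add"};
  std::optional<BindName> bySymbol = SymAttr{"_QPvector_add"};
  std::cout << *bindNameString(byString) << "\n";   // vector_add
  std::cout << *bindNameString(bySymbol) << "\n";   // _QPvector_add
}
```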
(op.hasGang(dtype) != hasDeviceType(gangArrayAttr, dtype)) return false; @@ -4458,8 +4512,10 @@ getArrayAttrOrNull(fir::FirOpBuilder &builder, void createOpenACCRoutineConstruct( Fortran::lower::AbstractConverter &converter, mlir::Location loc, mlir::ModuleOp mod, mlir::func::FuncOp funcOp, std::string funcName, - bool hasNohost, llvm::SmallVector &bindNames, - llvm::SmallVector &bindNameDeviceTypes, + bool hasNohost, llvm::SmallVector &bindIdNames, + llvm::SmallVector &bindStrNames, + llvm::SmallVector &bindIdNameDeviceTypes, + llvm::SmallVector &bindStrNameDeviceTypes, llvm::SmallVector &gangDeviceTypes, llvm::SmallVector &gangDimValues, llvm::SmallVector &gangDimDeviceTypes, @@ -4472,7 +4528,8 @@ void createOpenACCRoutineConstruct( 0) { // If the routine is already specified with the same clauses, just skip // the operation creation. - if (compareDeviceTypeInfo(routineOp, bindNames, bindNameDeviceTypes, + if (compareDeviceTypeInfo(routineOp, bindIdNames, bindStrNames, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, gangDeviceTypes, gangDimValues, gangDimDeviceTypes, seqDeviceTypes, workerDeviceTypes, vectorDeviceTypes) && @@ -4486,11 +4543,13 @@ void createOpenACCRoutineConstruct( std::string routineOpStr = routineOpName.str(); mlir::OpBuilder modBuilder(mod.getBodyRegion()); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - modBuilder.create( - loc, routineOpStr, + mlir::acc::RoutineOp::create( + modBuilder, loc, routineOpStr, mlir::SymbolRefAttr::get(builder.getContext(), funcName), - getArrayAttrOrNull(builder, bindNames), - getArrayAttrOrNull(builder, bindNameDeviceTypes), + getArrayAttrOrNull(builder, bindIdNames), + getArrayAttrOrNull(builder, bindStrNames), + getArrayAttrOrNull(builder, bindIdNameDeviceTypes), + getArrayAttrOrNull(builder, bindStrNameDeviceTypes), getArrayAttrOrNull(builder, workerDeviceTypes), getArrayAttrOrNull(builder, vectorDeviceTypes), getArrayAttrOrNull(builder, seqDeviceTypes), hasNohost, @@ -4507,8 +4566,10 @@ static void interpretRoutineDeviceInfo( llvm::SmallVector &seqDeviceTypes, llvm::SmallVector &vectorDeviceTypes, llvm::SmallVector &workerDeviceTypes, - llvm::SmallVector &bindNameDeviceTypes, - llvm::SmallVector &bindNames, + llvm::SmallVector &bindIdNameDeviceTypes, + llvm::SmallVector &bindStrNameDeviceTypes, + llvm::SmallVector &bindIdNames, + llvm::SmallVector &bindStrNames, llvm::SmallVector &gangDeviceTypes, llvm::SmallVector &gangDimValues, llvm::SmallVector &gangDimDeviceTypes) { @@ -4541,16 +4602,18 @@ static void interpretRoutineDeviceInfo( if (dinfo.bindNameOpt().has_value()) { const auto &bindName = dinfo.bindNameOpt().value(); mlir::Attribute bindNameAttr; - if (const auto &bindStr{std::get_if(&bindName)}) { + if (const auto &bindSym{ + std::get_if(&bindName)}) { + bindNameAttr = builder.getSymbolRefAttr(converter.mangleName(*bindSym)); + bindIdNames.push_back(bindNameAttr); + bindIdNameDeviceTypes.push_back(getDeviceTypeAttr()); + } else if (const auto &bindStr{std::get_if(&bindName)}) { bindNameAttr = builder.getStringAttr(*bindStr); - } else if (const auto &bindSym{ - std::get_if(&bindName)}) { - bindNameAttr = builder.getStringAttr(converter.mangleName(*bindSym)); + bindStrNames.push_back(bindNameAttr); + bindStrNameDeviceTypes.push_back(getDeviceTypeAttr()); } else { llvm_unreachable("Unsupported bind name type"); } - bindNames.push_back(bindNameAttr); - bindNameDeviceTypes.push_back(getDeviceTypeAttr()); } } @@ -4566,8 +4629,9 @@ void Fortran::lower::genOpenACCRoutineConstruct( bool hasNohost{false}; llvm::SmallVector 
seqDeviceTypes, vectorDeviceTypes, - workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimDeviceTypes, gangDimValues; + workerDeviceTypes, bindIdNameDeviceTypes, bindStrNameDeviceTypes, + bindIdNames, bindStrNames, gangDeviceTypes, gangDimDeviceTypes, + gangDimValues; for (const Fortran::semantics::OpenACCRoutineInfo &info : routineInfos) { // Device Independent Attributes @@ -4576,24 +4640,26 @@ void Fortran::lower::genOpenACCRoutineConstruct( } // Note: Device Independent Attributes are set to the // none device type in `info`. - interpretRoutineDeviceInfo(converter, info, seqDeviceTypes, - vectorDeviceTypes, workerDeviceTypes, - bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimValues, gangDimDeviceTypes); + interpretRoutineDeviceInfo( + converter, info, seqDeviceTypes, vectorDeviceTypes, workerDeviceTypes, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, bindIdNames, + bindStrNames, gangDeviceTypes, gangDimValues, gangDimDeviceTypes); // Device Dependent Attributes for (const Fortran::semantics::OpenACCRoutineDeviceTypeInfo &dinfo : info.deviceTypeInfos()) { - interpretRoutineDeviceInfo( - converter, dinfo, seqDeviceTypes, vectorDeviceTypes, - workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimValues, gangDimDeviceTypes); + interpretRoutineDeviceInfo(converter, dinfo, seqDeviceTypes, + vectorDeviceTypes, workerDeviceTypes, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, + bindIdNames, bindStrNames, gangDeviceTypes, + gangDimValues, gangDimDeviceTypes); } } createOpenACCRoutineConstruct( - converter, loc, mod, funcOp, funcName, hasNohost, bindNames, - bindNameDeviceTypes, gangDeviceTypes, gangDimValues, gangDimDeviceTypes, - seqDeviceTypes, workerDeviceTypes, vectorDeviceTypes); + converter, loc, mod, funcOp, funcName, hasNohost, bindIdNames, + bindStrNames, bindIdNameDeviceTypes, bindStrNameDeviceTypes, + gangDeviceTypes, gangDimValues, gangDimDeviceTypes, seqDeviceTypes, + workerDeviceTypes, vectorDeviceTypes); } static void @@ -4822,9 +4888,9 @@ void Fortran::lower::genOpenACCTerminator(fir::FirOpBuilder &builder, mlir::Operation *op, mlir::Location loc) { if (mlir::isa(op)) - builder.create(loc); + mlir::acc::YieldOp::create(builder, loc); else - builder.create(loc); + mlir::acc::TerminatorOp::create(builder, loc); } bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) { @@ -4844,7 +4910,7 @@ void Fortran::lower::genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::Value yieldValue = builder.createIntegerConstant(loc, builder.getI1Type(), 1); - builder.create(loc, yieldValue); + mlir::acc::YieldOp::create(builder, loc, yieldValue); } int64_t Fortran::lower::getLoopCountForCollapseAndTile( diff --git a/flang/lib/Lower/OpenMP/Atomic.cpp b/flang/lib/Lower/OpenMP/Atomic.cpp index 6ea331c370640..9a233d2d8cb08 100644 --- a/flang/lib/Lower/OpenMP/Atomic.cpp +++ b/flang/lib/Lower/OpenMP/Atomic.cpp @@ -528,8 +528,8 @@ genAtomicRead(lower::AbstractConverter &converter, }(); builder.restoreInsertionPoint(atomicAt); - mlir::Operation *op = builder.create( - loc, atomAddr, toAddr, mlir::TypeAttr::get(atomType), hint, + mlir::Operation *op = mlir::omp::AtomicReadOp::create( + builder, loc, atomAddr, toAddr, mlir::TypeAttr::get(atomType), hint, makeMemOrderAttr(converter, memOrder)); if (atomType != storeType) { @@ -537,7 +537,7 @@ genAtomicRead(lower::AbstractConverter &converter, // The READ operation could be a part of UPDATE CAPTURE, so make sure // we don't emit extra code into the body of 
the atomic op. builder.restoreInsertionPoint(postAt); - mlir::Value load = builder.create(loc, toAddr); + mlir::Value load = fir::LoadOp::create(builder, loc, toAddr); overrides.try_emplace(&atom, load); converter.overrideExprValues(&overrides); @@ -545,7 +545,7 @@ genAtomicRead(lower::AbstractConverter &converter, fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc)); converter.resetExprOverrides(); - builder.create(loc, value, storeAddr); + fir::StoreOp::create(builder, loc, value, storeAddr); } return op; } @@ -581,8 +581,9 @@ genAtomicWrite(lower::AbstractConverter &converter, mlir::Value converted = builder.createConvert(loc, atomType, value); builder.restoreInsertionPoint(atomicAt); - mlir::Operation *op = builder.create( - loc, atomAddr, converted, hint, makeMemOrderAttr(converter, memOrder)); + mlir::Operation *op = + mlir::omp::AtomicWriteOp::create(builder, loc, atomAddr, converted, hint, + makeMemOrderAttr(converter, memOrder)); return op; } @@ -635,8 +636,8 @@ genAtomicUpdate(lower::AbstractConverter &converter, } builder.restoreInsertionPoint(atomicAt); - auto updateOp = builder.create( - loc, atomAddr, hint, makeMemOrderAttr(converter, memOrder)); + auto updateOp = mlir::omp::AtomicUpdateOp::create( + builder, loc, atomAddr, hint, makeMemOrderAttr(converter, memOrder)); mlir::Region ®ion = updateOp->getRegion(0); mlir::Block *block = builder.createBlock(®ion, {}, {atomType}, {loc}); @@ -647,7 +648,7 @@ genAtomicUpdate(lower::AbstractConverter &converter, mlir::Value updated = fir::getBase(converter.genExprValue(rhs, stmtCtx, &loc)); mlir::Value converted = builder.createConvert(loc, atomType, updated); - builder.create(loc, converted); + mlir::omp::YieldOp::create(builder, loc, converted); converter.resetExprOverrides(); builder.restoreInsertionPoint(postAt); // For naCtx cleanups @@ -731,8 +732,8 @@ void Fortran::lower::omp::lowerAtomic( "Expexcing two actions"); (void)action0; (void)action1; - captureOp = builder.create( - loc, hint, makeMemOrderAttr(converter, memOrder)); + captureOp = mlir::omp::AtomicCaptureOp::create( + builder, loc, hint, makeMemOrderAttr(converter, memOrder)); // Set the non-atomic insertion point to before the atomic.capture. preAt = getInsertionPointBefore(captureOp); @@ -740,7 +741,7 @@ void Fortran::lower::omp::lowerAtomic( builder.setInsertionPointToEnd(block); // Set the atomic insertion point to before the terminator inside // atomic.capture. - mlir::Operation *term = builder.create(loc); + mlir::Operation *term = mlir::omp::TerminatorOp::create(builder, loc); atomicAt = getInsertionPointBefore(term); postAt = getInsertionPointAfter(captureOp); hint = nullptr; diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 74087d42a8e6e..8b3ad57c53810 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -723,7 +723,7 @@ bool ClauseProcessor::processCopyin() const { // barrier is inserted following all of them. 
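For context on `genAtomicUpdate` above: the update region receives the current value of the atomic location as a block argument and yields the value to store. Conceptually that corresponds to a compare-and-swap retry loop at runtime; the plain C++ analogue below is only an illustration of that idea, not what flang or the OpenMP runtime actually emit.

```cpp
#include <atomic>
#include <iostream>

// The "update region" is the callable: it sees the current value and returns
// the value to store; the loop retries until the exchange succeeds.
template <typename T, typename UpdateFn>
void atomicUpdate(std::atomic<T> &atom, UpdateFn update) {
  T expected = atom.load();
  // compare_exchange_weak reloads `expected` on failure, so each retry
  // recomputes the update from the latest value.
  while (!atom.compare_exchange_weak(expected, update(expected))) {
  }
}

int main() {
  std::atomic<int> x{10};
  // Analogue of `!$omp atomic update` around `x = x * 3 + 1`.
  atomicUpdate(x, [](int v) { return v * 3 + 1; });
  std::cout << x.load() << "\n";  // 31
}
```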
firOpBuilder.restoreInsertionPoint(insPt); if (hasCopyin) - firOpBuilder.create(converter.getCurrentLocation()); + mlir::omp::BarrierOp::create(firOpBuilder, converter.getCurrentLocation()); return hasCopyin; } @@ -803,7 +803,7 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter, llvm::SmallVector argsTy = {varType, varType}; auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {}); mlir::func::FuncOp funcOp = - modBuilder.create(loc, copyFuncName, funcType); + mlir::func::FuncOp::create(modBuilder, loc, copyFuncName, funcType); funcOp.setVisibility(mlir::SymbolTable::Visibility::Private); fir::factory::setInternalLinkage(funcOp); builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy, @@ -819,22 +819,22 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter, for (auto extent : typeInfo.getShape()) extents.push_back( builder.createIntegerConstant(loc, builder.getIndexType(), extent)); - shape = builder.create(loc, extents); + shape = fir::ShapeOp::create(builder, loc, extents); } mlir::Value dst = funcOp.getArgument(0); mlir::Value src = funcOp.getArgument(1); llvm::SmallVector typeparams; if (typeInfo.isBoxChar()) { // fir.boxchar will be passed here as fir.ref - auto loadDst = builder.create(loc, dst); - auto loadSrc = builder.create(loc, src); + auto loadDst = fir::LoadOp::create(builder, loc, dst); + auto loadSrc = fir::LoadOp::create(builder, loc, src); // get the actual fir.ref type mlir::Type refType = fir::ReferenceType::get(mlir::cast(eleTy).getEleTy()); - auto unboxedDst = builder.create( - loc, refType, builder.getIndexType(), loadDst); - auto unboxedSrc = builder.create( - loc, refType, builder.getIndexType(), loadSrc); + auto unboxedDst = fir::UnboxCharOp::create(builder, loc, refType, + builder.getIndexType(), loadDst); + auto unboxedSrc = fir::UnboxCharOp::create(builder, loc, refType, + builder.getIndexType(), loadSrc); // Add length to type parameters typeparams.push_back(unboxedDst.getResult(1)); dst = unboxedDst.getResult(0); @@ -844,14 +844,14 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter, loc, builder.getCharacterLengthType(), *typeInfo.getCharLength()); typeparams.push_back(charLen); } - auto declDst = builder.create( - loc, dst, copyFuncName + "_dst", shape, typeparams, + auto declDst = hlfir::DeclareOp::create( + builder, loc, dst, copyFuncName + "_dst", shape, typeparams, /*dummy_scope=*/nullptr, attrs); - auto declSrc = builder.create( - loc, src, copyFuncName + "_src", shape, typeparams, + auto declSrc = hlfir::DeclareOp::create( + builder, loc, src, copyFuncName + "_src", shape, typeparams, /*dummy_scope=*/nullptr, attrs); converter.copyVar(loc, declDst.getBase(), declSrc.getBase(), varAttrs); - builder.create(loc); + mlir::func::ReturnOp::create(builder, loc); return funcOp; } @@ -882,8 +882,8 @@ bool ClauseProcessor::processCopyprivate( if (mlir::isa(symType) || mlir::isa(symType)) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - auto alloca = builder.create(currentLocation, symType); - builder.create(currentLocation, symVal, alloca); + auto alloca = fir::AllocaOp::create(builder, currentLocation, symType); + fir::StoreOp::create(builder, currentLocation, symVal, alloca); cpVar = alloca; } @@ -1002,8 +1002,8 @@ bool ClauseProcessor::processDepend(lower::SymMap &symMap, // allocations so this is not a reliable way to identify the dependency. 
if (auto ref = mlir::dyn_cast(dependVar.getType())) if (fir::isa_box_type(ref.getElementType())) - dependVar = builder.create( - converter.getCurrentLocation(), dependVar); + dependVar = fir::LoadOp::create( + builder, converter.getCurrentLocation(), dependVar); // The openmp dialect doesn't know what to do with boxes (and it would // break layering to teach it about them). The dependency variable can be @@ -1012,8 +1012,8 @@ bool ClauseProcessor::processDepend(lower::SymMap &symMap, // Getting the address of the box data is okay because all the runtime // ultimately cares about is the base address of the array. if (fir::isa_box_type(dependVar.getType())) - dependVar = builder.create( - converter.getCurrentLocation(), dependVar); + dependVar = fir::BoxAddrOp::create( + builder, converter.getCurrentLocation(), dependVar); result.dependVars.push_back(dependVar); } @@ -1315,7 +1315,8 @@ bool ClauseProcessor::processMap( const parser::CharBlock &source) { using Map = omp::clause::Map; mlir::Location clauseLocation = converter.genLocation(source); - const auto &[mapType, typeMods, mappers, iterator, objects] = clause.t; + const auto &[mapType, typeMods, refMod, mappers, iterator, objects] = + clause.t; llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; std::string mapperIdName = "__implicit_mapper"; @@ -1342,16 +1343,13 @@ bool ClauseProcessor::processMap( mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; break; - case Map::MapType::Alloc: - case Map::MapType::Release: + case Map::MapType::Storage: // alloc and release is the default map_type for the Target Data // Ops, i.e. if no bits for map_type is supplied then alloc/release - // is implicitly assumed based on the target directive. Default - // value for Target Data and Enter Data is alloc and for Exit Data - // it is release. + // (aka storage in 6.0+) is implicitly assumed based on the target + // directive. Default value for Target Data and Enter Data is alloc + // and for Exit Data it is release. 
break; - case Map::MapType::Delete: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; } if (typeMods) { @@ -1362,6 +1360,8 @@ bool ClauseProcessor::processMap( mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT; if (llvm::is_contained(*typeMods, Map::MapTypeModifier::Close)) mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE; + if (llvm::is_contained(*typeMods, Map::MapTypeModifier::Delete)) + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; if (llvm::is_contained(*typeMods, Map::MapTypeModifier::OmpxHold)) mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_OMPX_HOLD; } diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 22a07219d3a50..686fba0154f44 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -1001,19 +1001,21 @@ Map make(const parser::OmpClause::Map &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpMapClause CLAUSET_ENUM_CONVERT( // - convert1, parser::OmpMapType::Value, Map::MapType, + convertMapType, parser::OmpMapType::Value, Map::MapType, // clang-format off - MS(Alloc, Alloc) - MS(Delete, Delete) - MS(From, From) - MS(Release, Release) - MS(To, To) - MS(Tofrom, Tofrom) + MS(Alloc, Storage) + MS(Delete, Storage) + MS(Release, Storage) + MS(Storage, Storage) + MS(From, From) + MS(To, To) + MS(Tofrom, Tofrom) // clang-format on ); CLAUSET_ENUM_CONVERT( // - convert2, parser::OmpMapTypeModifier::Value, Map::MapTypeModifier, + convertMapTypeMod, parser::OmpMapTypeModifier::Value, + Map::MapTypeModifier, // clang-format off MS(Always, Always) MS(Close, Close) @@ -1022,43 +1024,76 @@ Map make(const parser::OmpClause::Map &inp, // clang-format on ); + CLAUSET_ENUM_CONVERT( // + convertRefMod, parser::OmpRefModifier::Value, Map::RefModifier, + // clang-format off + MS(Ref_Ptee, RefPtee) + MS(Ref_Ptr, RefPtr) + MS(Ref_Ptr_Ptee, RefPtrPtee) + // clang-format on + ); + + // Treat always, close, present, self, delete modifiers as map-type- + // modifiers. 
auto &mods = semantics::OmpGetModifiers(inp.v); - auto *t1 = semantics::OmpGetUniqueModifier(mods); - auto *t2 = semantics::OmpGetUniqueModifier(mods); - auto *t3 = semantics::OmpGetUniqueModifier(mods); - auto &t4 = std::get(inp.v.t); - auto mappers = [&]() -> std::optional> { + auto *t1 = semantics::OmpGetUniqueModifier(mods); + auto &t2 = std::get(inp.v.t); + + auto type = [&]() -> std::optional { if (t1) - return List{Mapper{makeObject(t1->v, semaCtx)}}; + return convertMapType(t1->v); return std::nullopt; }(); - auto iterator = [&]() -> std::optional { - if (t2) - return makeIterator(*t2, semaCtx); + llvm::DenseSet modSet; + if (t1 && t1->v == parser::OmpMapType::Value::Delete) + modSet.insert(Map::MapTypeModifier::Delete); + + for (auto *typeMod : + semantics::OmpGetRepeatableModifier(mods)) { + modSet.insert(convertMapTypeMod(typeMod->v)); + } + if (semantics::OmpGetUniqueModifier(mods)) + modSet.insert(Map::MapTypeModifier::Always); + if (semantics::OmpGetUniqueModifier(mods)) + modSet.insert(Map::MapTypeModifier::Close); + if (semantics::OmpGetUniqueModifier(mods)) + modSet.insert(Map::MapTypeModifier::Delete); + if (semantics::OmpGetUniqueModifier(mods)) + modSet.insert(Map::MapTypeModifier::Present); + if (semantics::OmpGetUniqueModifier(mods)) + modSet.insert(Map::MapTypeModifier::Self); + if (semantics::OmpGetUniqueModifier(mods)) + modSet.insert(Map::MapTypeModifier::OmpxHold); + + std::optional maybeTypeMods{}; + if (!modSet.empty()) + maybeTypeMods = Map::MapTypeModifiers(modSet.begin(), modSet.end()); + + auto refMod = [&]() -> std::optional { + if (auto *t = semantics::OmpGetUniqueModifier(mods)) + return convertRefMod(t->v); return std::nullopt; }(); - auto type = [&]() -> std::optional { - if (t3) - return convert1(t3->v); + auto mappers = [&]() -> std::optional> { + if (auto *t = semantics::OmpGetUniqueModifier(mods)) + return List{Mapper{makeObject(t->v, semaCtx)}}; return std::nullopt; }(); - Map::MapTypeModifiers typeMods; - for (auto *typeMod : - semantics::OmpGetRepeatableModifier(mods)) { - typeMods.push_back(convert2(typeMod->v)); - } - std::optional maybeTypeMods{}; - if (!typeMods.empty()) - maybeTypeMods = std::move(typeMods); + auto iterator = [&]() -> std::optional { + if (auto *t = semantics::OmpGetUniqueModifier(mods)) + return makeIterator(*t, semaCtx); + return std::nullopt; + }(); return Map{{/*MapType=*/std::move(type), /*MapTypeModifiers=*/std::move(maybeTypeMods), - /*Mapper=*/std::move(mappers), /*Iterator=*/std::move(iterator), - /*LocatorList=*/makeObjects(t4, semaCtx)}}; + /*RefModifier=*/std::move(refMod), /*Mapper=*/std::move(mappers), + /*Iterator=*/std::move(iterator), + /*LocatorList=*/makeObjects(t2, semaCtx)}}; } Match make(const parser::OmpClause::Match &inp, diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 3fae3f3a0ddfd..11e488371b886 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -26,6 +26,8 @@ #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Semantics/attr.h" #include "flang/Semantics/tools.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallSet.h" namespace Fortran { namespace lower { @@ -49,7 +51,7 @@ DataSharingProcessor::DataSharingProcessor( firOpBuilder(converter.getFirOpBuilder()), clauses(clauses), eval(eval), shouldCollectPreDeterminedSymbols(shouldCollectPreDeterminedSymbols), useDelayedPrivatization(useDelayedPrivatization), symTable(symTable), - visitor() { + 
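The conversion above folds the pre-6.0 map types alloc, release, and delete into the OpenMP 6.0 "storage" map type, recording delete as a map-type-modifier instead so no information is lost. A plain C++ sketch of that folding (the enums and names here are illustrative, not the actual parser or clause types):

```cpp
#include <cassert>
#include <optional>
#include <set>

// Illustrative stand-ins for the parser-side and clause-side enums.
enum class ParsedMapType { Alloc, Delete, Release, Storage, From, To, Tofrom };
enum class MapType { Storage, From, To, Tofrom };
enum class MapTypeModifier { Always, Close, Delete, Present, Self, OmpxHold };

struct ConvertedMap {
  std::optional<MapType> type;
  std::set<MapTypeModifier> modifiers;
};

// alloc/release/delete all become "storage"; delete additionally turns into
// the Delete map-type-modifier.
ConvertedMap convert(ParsedMapType parsed) {
  ConvertedMap result;
  switch (parsed) {
  case ParsedMapType::Alloc:
  case ParsedMapType::Release:
  case ParsedMapType::Storage:
    result.type = MapType::Storage;
    break;
  case ParsedMapType::Delete:
    result.type = MapType::Storage;
    result.modifiers.insert(MapTypeModifier::Delete);
    break;
  case ParsedMapType::From:   result.type = MapType::From;   break;
  case ParsedMapType::To:     result.type = MapType::To;     break;
  case ParsedMapType::Tofrom: result.type = MapType::Tofrom; break;
  }
  return result;
}

int main() {
  ConvertedMap m = convert(ParsedMapType::Delete);
  assert(m.type == MapType::Storage);
  assert(m.modifiers.count(MapTypeModifier::Delete) == 1);
}
```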
visitor(semaCtx) { eval.visit([&](const auto &functionParserNode) { parser::Walk(functionParserNode, visitor); }); @@ -289,7 +291,7 @@ void DataSharingProcessor::insertBarrier( clauseOps->privateNeedsBarrier = mlir::UnitAttr::get(&converter.getMLIRContext()); } else { - firOpBuilder.create(converter.getCurrentLocation()); + mlir::omp::BarrierOp::create(firOpBuilder, converter.getCurrentLocation()); } } @@ -349,32 +351,32 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { loopOp.getIVs(), result.loopUpperBounds, result.loopSteps)) { // v = iv + step // cmp = step < 0 ? v < ub : v > ub - mlir::Value v = firOpBuilder.create(loc, iv, step); + mlir::Value v = mlir::arith::AddIOp::create(firOpBuilder, loc, iv, step); vs.push_back(v); mlir::Value zero = firOpBuilder.createIntegerConstant(loc, step.getType(), 0); - mlir::Value negativeStep = firOpBuilder.create( - loc, mlir::arith::CmpIPredicate::slt, step, zero); - mlir::Value vLT = firOpBuilder.create( - loc, mlir::arith::CmpIPredicate::slt, v, ub); - mlir::Value vGT = firOpBuilder.create( - loc, mlir::arith::CmpIPredicate::sgt, v, ub); - mlir::Value icmpOp = firOpBuilder.create( - loc, negativeStep, vLT, vGT); + mlir::Value negativeStep = mlir::arith::CmpIOp::create( + firOpBuilder, loc, mlir::arith::CmpIPredicate::slt, step, zero); + mlir::Value vLT = mlir::arith::CmpIOp::create( + firOpBuilder, loc, mlir::arith::CmpIPredicate::slt, v, ub); + mlir::Value vGT = mlir::arith::CmpIOp::create( + firOpBuilder, loc, mlir::arith::CmpIPredicate::sgt, v, ub); + mlir::Value icmpOp = mlir::arith::SelectOp::create( + firOpBuilder, loc, negativeStep, vLT, vGT); if (cmpOp) - cmpOp = firOpBuilder.create(loc, cmpOp, icmpOp); + cmpOp = mlir::arith::AndIOp::create(firOpBuilder, loc, cmpOp, icmpOp); else cmpOp = icmpOp; } - auto ifOp = firOpBuilder.create(loc, cmpOp, /*else*/ false); + auto ifOp = fir::IfOp::create(firOpBuilder, loc, cmpOp, /*else*/ false); firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front()); for (auto [v, loopIV] : llvm::zip_equal(vs, loopIVs)) { hlfir::Entity loopIVEntity{loopIV}; loopIVEntity = hlfir::derefPointersAndAllocatables(loc, firOpBuilder, loopIVEntity); - firOpBuilder.create(loc, v, loopIVEntity); + hlfir::AssignOp::create(firOpBuilder, loc, v, loopIVEntity); } lastPrivIP = firOpBuilder.saveInsertionPoint(); } else if (mlir::isa(op)) { @@ -424,24 +426,55 @@ getSource(const semantics::SemanticsContext &semaCtx, return source; } +static void collectPrivatizingConstructs( + llvm::SmallSet &constructs, unsigned version) { + using Clause = llvm::omp::Clause; + using Directive = llvm::omp::Directive; + + static const Clause privatizingClauses[] = { + Clause::OMPC_private, + Clause::OMPC_lastprivate, + Clause::OMPC_firstprivate, + Clause::OMPC_in_reduction, + Clause::OMPC_reduction, + Clause::OMPC_linear, + // TODO: Clause::OMPC_induction, + Clause::OMPC_task_reduction, + Clause::OMPC_detach, + Clause::OMPC_use_device_ptr, + Clause::OMPC_is_device_ptr, + }; + + for (auto dir : llvm::enum_seq_inclusive(Directive::First_, + Directive::Last_)) { + bool allowsPrivatizing = llvm::any_of(privatizingClauses, [&](Clause cls) { + return llvm::omp::isAllowedClauseForDirective(dir, cls, version); + }); + if (allowsPrivatizing) + constructs.insert(dir); + } +} + bool DataSharingProcessor::isOpenMPPrivatizingConstruct( - const parser::OpenMPConstruct &omp) { - return common::visit( - [](auto &&s) { - using BareS = llvm::remove_cvref_t; - return std::is_same_v || - std::is_same_v || - std::is_same_v; - }, - 
omp.u); + const parser::OpenMPConstruct &omp, unsigned version) { + static llvm::SmallSet privatizing; + [[maybe_unused]] static bool init = + (collectPrivatizingConstructs(privatizing, version), true); + + // As of OpenMP 6.0, privatizing constructs (with the test being if they + // allow a privatizing clause) are: dispatch, distribute, do, for, loop, + // parallel, scope, sections, simd, single, target, target_data, task, + // taskgroup, taskloop, and teams. + return llvm::is_contained(privatizing, extractOmpDirective(omp)); } bool DataSharingProcessor::isOpenMPPrivatizingEvaluation( const pft::Evaluation &eval) const { - return eval.visit([](auto &&s) { + unsigned version = semaCtx.langOptions().OpenMPVersion; + return eval.visit([=](auto &&s) { using BareS = llvm::remove_cvref_t; if constexpr (std::is_same_v) { - return isOpenMPPrivatizingConstruct(s); + return isOpenMPPrivatizingConstruct(s, version); } else { return false; } diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index ee2fc70d2e673..bc422f410403a 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -36,6 +36,8 @@ class DataSharingProcessor { /// at any point in time. This is used to track Symbol definition scopes in /// order to tell which OMP scope defined vs. references a certain Symbol. struct OMPConstructSymbolVisitor { + OMPConstructSymbolVisitor(semantics::SemanticsContext &ctx) + : version(ctx.langOptions().OpenMPVersion) {} template bool Pre(const T &) { return true; @@ -45,13 +47,13 @@ class DataSharingProcessor { bool Pre(const parser::OpenMPConstruct &omp) { // Skip constructs that may not have privatizations. - if (isOpenMPPrivatizingConstruct(omp)) + if (isOpenMPPrivatizingConstruct(omp, version)) constructs.push_back(&omp); return true; } void Post(const parser::OpenMPConstruct &omp) { - if (isOpenMPPrivatizingConstruct(omp)) + if (isOpenMPPrivatizingConstruct(omp, version)) constructs.pop_back(); } @@ -68,6 +70,9 @@ class DataSharingProcessor { /// construct that defines symbol. 
bool isSymbolDefineBy(const semantics::Symbol *symbol, lower::pft::Evaluation &eval) const; + + private: + unsigned version; }; mlir::OpBuilder::InsertPoint lastPrivIP; @@ -115,7 +120,8 @@ class DataSharingProcessor { mlir::OpBuilder::InsertPoint *lastPrivIP); void insertDeallocs(); - static bool isOpenMPPrivatizingConstruct(const parser::OpenMPConstruct &omp); + static bool isOpenMPPrivatizingConstruct(const parser::OpenMPConstruct &omp, + unsigned version); bool isOpenMPPrivatizingEvaluation(const pft::Evaluation &eval) const; public: diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 4458f62eea95a..fc5fef9b2c577 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -63,6 +63,28 @@ static void processHostEvalClauses(lower::AbstractConverter &converter, lower::pft::Evaluation &eval, mlir::Location loc); +static llvm::omp::Directive +getOpenMPDirectiveEnum(const parser::OmpLoopDirective &beginStatement) { + return beginStatement.v; +} + +static llvm::omp::Directive getOpenMPDirectiveEnum( + const parser::OmpBeginLoopDirective &beginLoopDirective) { + return getOpenMPDirectiveEnum( + std::get<parser::OmpLoopDirective>(beginLoopDirective.t)); +} + +static llvm::omp::Directive +getOpenMPDirectiveEnum(const parser::OpenMPLoopConstruct &ompLoopConstruct) { + return getOpenMPDirectiveEnum( + std::get<parser::OmpBeginLoopDirective>(ompLoopConstruct.t)); +} + +static llvm::omp::Directive getOpenMPDirectiveEnum( + const common::Indirection<parser::OpenMPLoopConstruct> &ompLoopConstruct) { + return getOpenMPDirectiveEnum(ompLoopConstruct.value()); +} + namespace { /// Structure holding information that is needed to pass host-evaluated /// information to later lowering stages. @@ -372,90 +394,6 @@ extractMappedBaseValues(llvm::ArrayRef<mlir::Value> vars, }); } -/// Get the directive enumeration value corresponding to the given OpenMP -/// construct PFT node.
-llvm::omp::Directive -extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) { - return common::visit( - common::visitors{ - [](const parser::OpenMPAllocatorsConstruct &c) { - return llvm::omp::OMPD_allocators; - }, - [](const parser::OpenMPAssumeConstruct &c) { - return llvm::omp::OMPD_assume; - }, - [](const parser::OpenMPAtomicConstruct &c) { - return llvm::omp::OMPD_atomic; - }, - [](const parser::OpenMPBlockConstruct &c) { - return std::get( - std::get(c.t).t) - .v; - }, - [](const parser::OpenMPCriticalConstruct &c) { - return llvm::omp::OMPD_critical; - }, - [](const parser::OpenMPDeclarativeAllocate &c) { - return llvm::omp::OMPD_allocate; - }, - [](const parser::OpenMPDispatchConstruct &c) { - return llvm::omp::OMPD_dispatch; - }, - [](const parser::OpenMPExecutableAllocate &c) { - return llvm::omp::OMPD_allocate; - }, - [](const parser::OpenMPLoopConstruct &c) { - return std::get( - std::get(c.t).t) - .v; - }, - [](const parser::OpenMPSectionConstruct &c) { - return llvm::omp::OMPD_section; - }, - [](const parser::OpenMPSectionsConstruct &c) { - return std::get( - std::get(c.t).t) - .v; - }, - [](const parser::OpenMPStandaloneConstruct &c) { - return common::visit( - common::visitors{ - [](const parser::OpenMPSimpleStandaloneConstruct &c) { - return c.v.DirId(); - }, - [](const parser::OpenMPFlushConstruct &c) { - return llvm::omp::OMPD_flush; - }, - [](const parser::OpenMPCancelConstruct &c) { - return llvm::omp::OMPD_cancel; - }, - [](const parser::OpenMPCancellationPointConstruct &c) { - return llvm::omp::OMPD_cancellation_point; - }, - [](const parser::OmpMetadirectiveDirective &c) { - return llvm::omp::OMPD_metadirective; - }, - [](const parser::OpenMPDepobjConstruct &c) { - return llvm::omp::OMPD_depobj; - }, - [](const parser::OpenMPInteropConstruct &c) { - return llvm::omp::OMPD_interop; - }}, - c.u); - }, - [](const parser::OpenMPUtilityConstruct &c) { - return common::visit( - common::visitors{[](const parser::OmpErrorDirective &c) { - return llvm::omp::OMPD_error; - }, - [](const parser::OmpNothingDirective &c) { - return llvm::omp::OMPD_nothing; - }}, - c.u); - }}, - ompConstruct.u); -} - /// Populate the global \see hostEvalInfo after processing clauses for the given /// \p eval OpenMP target construct, or nested constructs, if these must be /// evaluated outside of the target region per the spec. 
@@ -726,8 +664,8 @@ static void threadPrivatizeVars(lower::AbstractConverter &converter, op = declOp.getMemref().getDefiningOp(); if (mlir::isa(op)) symValue = mlir::dyn_cast(op).getSymAddr(); - return firOpBuilder.create( - currentLocation, symValue.getType(), symValue); + return mlir::omp::ThreadprivateOp::create(firOpBuilder, currentLocation, + symValue.getType(), symValue); }; llvm::SetVector threadprivateSyms; @@ -794,7 +732,7 @@ createAndSetPrivatizedLoopVar(lower::AbstractConverter &converter, lhs = hlfir::derefPointersAndAllocatables(loc, firOpBuilder, lhs); mlir::Operation *storeOp = - firOpBuilder.create(loc, cvtVal, lhs); + hlfir::AssignOp::create(firOpBuilder, loc, cvtVal, lhs); return storeOp; } @@ -1240,8 +1178,8 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info, fir::FirOpBuilder &firOpBuilder = info.converter.getFirOpBuilder(); auto insertMarker = [](fir::FirOpBuilder &builder) { - mlir::Value undef = builder.create(builder.getUnknownLoc(), - builder.getIndexType()); + mlir::Value undef = fir::UndefOp::create(builder, builder.getUnknownLoc(), + builder.getIndexType()); return undef.getDefiningOp(); }; @@ -1355,7 +1293,7 @@ static void createBodyOfOp(mlir::Operation &op, const OpWithBodyGenInfo &info, mlir::Block *exit = firOpBuilder.createBlock(®ion); for (mlir::Block *b : exits) { firOpBuilder.setInsertionPointToEnd(b); - firOpBuilder.create(info.loc, exit); + mlir::cf::BranchOp::create(firOpBuilder, info.loc, exit); } return exit; }; @@ -1416,8 +1354,8 @@ static void genBodyOfTargetDataOp( // Remembering the position for further insertion is important since // there are hlfir.declares inserted above while setting block arguments // and new code from the body should be inserted after that. - mlir::Value undefMarker = firOpBuilder.create( - dataOp.getLoc(), firOpBuilder.getIndexType()); + mlir::Value undefMarker = fir::UndefOp::create(firOpBuilder, dataOp.getLoc(), + firOpBuilder.getIndexType()); // Create blocks for unstructured regions. This has to be done since // blocks are initially allocated with the function as the parent region. @@ -1426,7 +1364,7 @@ static void genBodyOfTargetDataOp( firOpBuilder, eval.getNestedEvaluations()); } - firOpBuilder.create(currentLocation); + mlir::omp::TerminatorOp::create(firOpBuilder, currentLocation); // Set the insertion point after the marker. firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); @@ -1580,8 +1518,8 @@ static void genBodyOfTargetOp( insertIndex, copyVal.getType(), copyVal.getLoc()); firOpBuilder.setInsertionPointToStart(entryBlock); - auto loadOp = firOpBuilder.create(clonedValArg.getLoc(), - clonedValArg); + auto loadOp = fir::LoadOp::create(firOpBuilder, clonedValArg.getLoc(), + clonedValArg); val.replaceUsesWithIf(loadOp->getResult(0), [entryBlock](mlir::OpOperand &use) { return use.getOwner()->getBlock() == entryBlock; @@ -1597,8 +1535,8 @@ static void genBodyOfTargetOp( // marker will be deleted since there are not uses. // In the HLFIR flow there are hlfir.declares inserted above while // setting block arguments. - mlir::Value undefMarker = firOpBuilder.create( - targetOp.getLoc(), firOpBuilder.getIndexType()); + mlir::Value undefMarker = fir::UndefOp::create( + firOpBuilder, targetOp.getLoc(), firOpBuilder.getIndexType()); // Create blocks for unstructured regions. This has to be done since // blocks are initially allocated with the function as the parent region. 
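// Illustrative sketch, not part of the patch: the recurring mechanical change
// in the hunks above and below is the MLIR builder-API migration from the
// member-template form `builder.create<OpTy>(loc, ...)` to the static
// `OpTy::create(builder, loc, ...)` form. Both spellings build the same
// operation; a minimal side-by-side example using the barrier op:
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"

static void emitBarrierBothWays(fir::FirOpBuilder &builder, mlir::Location loc) {
  builder.create<mlir::omp::BarrierOp>(loc);  // old spelling
  mlir::omp::BarrierOp::create(builder, loc); // new spelling used by this patch
}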
@@ -1608,7 +1546,7 @@ static void genBodyOfTargetOp( firOpBuilder, eval.getNestedEvaluations()); } - firOpBuilder.create(currentLocation); + mlir::omp::TerminatorOp::create(firOpBuilder, currentLocation); // Create the insertion point after the marker. firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); @@ -1654,7 +1592,7 @@ static OpTy genWrapperOp(lower::AbstractConverter &converter, fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); // Create wrapper. - auto op = firOpBuilder.create(loc, clauseOps); + auto op = OpTy::create(firOpBuilder, loc, clauseOps); // Create entry block with arguments. genEntryBlock(firOpBuilder, args, op.getRegion()); @@ -2067,7 +2005,7 @@ genCriticalOp(lower::AbstractConverter &converter, lower::SymMap &symTable, clauseOps, nameStr); mlir::OpBuilder modBuilder(mod.getBodyRegion()); - global = modBuilder.create(loc, clauseOps); + global = mlir::omp::CriticalDeclareOp::create(modBuilder, loc, clauseOps); } nameAttr = mlir::FlatSymbolRefAttr::get(firOpBuilder.getContext(), global.getSymName()); @@ -2153,6 +2091,163 @@ genLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable, return loopOp; } +static mlir::omp::CanonicalLoopOp +genCanonicalLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, + ConstructQueue::const_iterator item, + llvm::ArrayRef ivs, + llvm::omp::Directive directive, DataSharingProcessor &dsp) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + assert(ivs.size() == 1 && "Nested loops not yet implemented"); + const semantics::Symbol *iv = ivs[0]; + + auto &nestedEval = eval.getFirstNestedEvaluation(); + if (nestedEval.getIf()->IsDoConcurrent()) { + // OpenMP specifies DO CONCURRENT only with the `!omp loop` construct. Will + // need to add special cases for this combination. + TODO(loc, "DO CONCURRENT as canonical loop not supported"); + } + + // Get the loop bounds (and increment) + auto &doLoopEval = nestedEval.getFirstNestedEvaluation(); + auto *doStmt = doLoopEval.getIf(); + assert(doStmt && "Expected do loop to be in the nested evaluation"); + auto &loopControl = std::get>(doStmt->t); + assert(loopControl.has_value()); + auto *bounds = std::get_if(&loopControl->u); + assert(bounds && "Expected bounds for canonical loop"); + lower::StatementContext stmtCtx; + mlir::Value loopLBVar = fir::getBase( + converter.genExprValue(*semantics::GetExpr(bounds->lower), stmtCtx)); + mlir::Value loopUBVar = fir::getBase( + converter.genExprValue(*semantics::GetExpr(bounds->upper), stmtCtx)); + mlir::Value loopStepVar = [&]() { + if (bounds->step) { + return fir::getBase( + converter.genExprValue(*semantics::GetExpr(bounds->step), stmtCtx)); + } + + // If `step` is not present, assume it is `1`. 
+ return firOpBuilder.createIntegerConstant(loc, firOpBuilder.getI32Type(), + 1); + }(); + + // Get the integer kind for the loop variable and cast the loop bounds + size_t loopVarTypeSize = bounds->name.thing.symbol->GetUltimate().size(); + mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); + loopLBVar = firOpBuilder.createConvert(loc, loopVarType, loopLBVar); + loopUBVar = firOpBuilder.createConvert(loc, loopVarType, loopUBVar); + loopStepVar = firOpBuilder.createConvert(loc, loopVarType, loopStepVar); + + // Start lowering + mlir::Value zero = firOpBuilder.createIntegerConstant(loc, loopVarType, 0); + mlir::Value one = firOpBuilder.createIntegerConstant(loc, loopVarType, 1); + mlir::Value isDownwards = firOpBuilder.create( + loc, mlir::arith::CmpIPredicate::slt, loopStepVar, zero); + + // Ensure we are counting upwards. If not, negate step and swap lb and ub. + mlir::Value negStep = + firOpBuilder.create(loc, zero, loopStepVar); + mlir::Value incr = firOpBuilder.create( + loc, isDownwards, negStep, loopStepVar); + mlir::Value lb = firOpBuilder.create( + loc, isDownwards, loopUBVar, loopLBVar); + mlir::Value ub = firOpBuilder.create( + loc, isDownwards, loopLBVar, loopUBVar); + + // Compute the trip count assuming lb <= ub. This guarantees that the result + // is non-negative and we can use unsigned arithmetic. + mlir::Value span = firOpBuilder.create( + loc, ub, lb, ::mlir::arith::IntegerOverflowFlags::nuw); + mlir::Value tcMinusOne = + firOpBuilder.create(loc, span, incr); + mlir::Value tcIfLooping = firOpBuilder.create( + loc, tcMinusOne, one, ::mlir::arith::IntegerOverflowFlags::nuw); + + // Fall back to 0 if lb > ub + mlir::Value isZeroTC = firOpBuilder.create( + loc, mlir::arith::CmpIPredicate::slt, ub, lb); + mlir::Value tripcount = firOpBuilder.create( + loc, isZeroTC, zero, tcIfLooping); + + // Create the CLI handle. + auto newcli = firOpBuilder.create(loc); + mlir::Value cli = newcli.getResult(); + + auto ivCallback = [&](mlir::Operation *op) + -> llvm::SmallVector { + mlir::Region ®ion = op->getRegion(0); + + // Create the op's region skeleton (BB taking the iv as argument) + firOpBuilder.createBlock(®ion, {}, {loopVarType}, {loc}); + + // Compute the value of the loop variable from the logical iteration number. + mlir::Value natIterNum = fir::getBase(region.front().getArgument(0)); + mlir::Value scaled = + firOpBuilder.create(loc, natIterNum, loopStepVar); + mlir::Value userVal = + firOpBuilder.create(loc, loopLBVar, scaled); + + // The argument is not currently in memory, so make a temporary for the + // argument, and store it there, then bind that location to the argument. 
+ mlir::Operation *storeOp = + createAndSetPrivatizedLoopVar(converter, loc, userVal, iv); + + firOpBuilder.setInsertionPointAfter(storeOp); + return {iv}; + }; + + // Create the omp.canonical_loop operation + auto canonLoop = genOpWithBody( + OpWithBodyGenInfo(converter, symTable, semaCtx, loc, nestedEval, + directive) + .setClauses(&item->clauses) + .setDataSharingProcessor(&dsp) + .setGenRegionEntryCb(ivCallback), + queue, item, tripcount, cli); + + firOpBuilder.setInsertionPointAfter(canonLoop); + return canonLoop; +} + +static void genUnrollOp(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + lower::StatementContext &stmtCtx, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + mlir::Location loc, const ConstructQueue &queue, + ConstructQueue::const_iterator item) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + mlir::omp::LoopRelatedClauseOps loopInfo; + llvm::SmallVector iv; + collectLoopRelatedInfo(converter, loc, eval, item->clauses, loopInfo, iv); + + // Clauses for unrolling not yet implemnted + ClauseProcessor cp(converter, semaCtx, item->clauses); + cp.processTODO( + loc, llvm::omp::Directive::OMPD_unroll); + + // Even though unroll does not support data-sharing clauses, but this is + // required to fill the symbol table. + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, + /*shouldCollectPreDeterminedSymbols=*/true, + /*useDelayedPrivatization=*/false, symTable); + dsp.processStep1(); + + // Emit the associated loop + auto canonLoop = + genCanonicalLoopOp(converter, symTable, semaCtx, eval, loc, queue, item, + iv, llvm::omp::Directive::OMPD_unroll, dsp); + + // Apply unrolling to it + auto cli = canonLoop.getCli(); + firOpBuilder.create(loc, cli); +} + static mlir::omp::MaskedOp genMaskedOp(lower::AbstractConverter &converter, lower::SymMap &symTable, lower::StatementContext &stmtCtx, @@ -2285,7 +2380,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, } // SECTIONS construct. - auto sectionsOp = builder.create(loc, clauseOps); + auto sectionsOp = mlir::omp::SectionsOp::create(builder, loc, clauseOps); // Create entry block with reduction variables as arguments. EntryBlockArgs args; @@ -2361,7 +2456,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, // races on post-update of lastprivate variables when `nowait` // clause is present. 
if (clauseOps.nowait && !lastprivates.empty()) - builder.create(loc); + mlir::omp::BarrierOp::create(builder, loc); return sectionsOp; } @@ -2513,7 +2608,7 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, }; lower::pft::visitAllSymbols(eval, captureImplicitMap); - auto targetOp = firOpBuilder.create(loc, clauseOps); + auto targetOp = mlir::omp::TargetOp::create(firOpBuilder, loc, clauseOps); llvm::SmallVector hasDeviceAddrBaseValues, mapBaseValues; extractMappedBaseValues(clauseOps.hasDeviceAddrVars, hasDeviceAddrBaseValues); @@ -2593,7 +2688,7 @@ static OpTy genTargetEnterExitUpdateDataOp( genTargetEnterExitUpdateDataClauses(converter, semaCtx, symTable, stmtCtx, item->clauses, loc, directive, clauseOps); - return firOpBuilder.create(loc, clauseOps); + return OpTy::create(firOpBuilder, loc, clauseOps); } static mlir::omp::TaskOp @@ -3333,12 +3428,14 @@ static void genOMPDispatch(lower::AbstractConverter &converter, newOp = genTeamsOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); break; - case llvm::omp::Directive::OMPD_tile: - case llvm::omp::Directive::OMPD_unroll: { + case llvm::omp::Directive::OMPD_tile: { unsigned version = semaCtx.langOptions().OpenMPVersion; TODO(loc, "Unhandled loop directive (" + llvm::omp::getOpenMPDirectiveName(dir, version) + ")"); } + case llvm::omp::Directive::OMPD_unroll: + genUnrollOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); + break; // case llvm::omp::Directive::OMPD_workdistribute: case llvm::omp::Directive::OMPD_workshare: newOp = genWorkshareOp(converter, symTable, stmtCtx, semaCtx, eval, loc, @@ -3426,8 +3523,8 @@ genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, firOpBuilder.setInsertionPointToStart(converter.getModuleOp().getBody()); auto mlirType = converter.genType(varType.declTypeSpec->derivedTypeSpec()); - auto declMapperOp = firOpBuilder.create( - loc, mapperNameStr, mlirType); + auto declMapperOp = mlir::omp::DeclareMapperOp::create( + firOpBuilder, loc, mapperNameStr, mlirType); auto ®ion = declMapperOp.getRegion(); firOpBuilder.createBlock(®ion); auto varVal = region.addArgument(firOpBuilder.getRefType(mlirType), loc); @@ -3440,7 +3537,7 @@ genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, List clauses = makeClauses(*clauseList, semaCtx); ClauseProcessor cp(converter, semaCtx, clauses); cp.processMap(loc, stmtCtx, clauseOps); - firOpBuilder.create(loc, clauseOps.mapVars); + mlir::omp::DeclareMapperInfoOp::create(firOpBuilder, loc, clauseOps.mapVars); } static void @@ -3774,12 +3871,25 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, if (auto *ompNestedLoopCons{ std::get_if>( &*optLoopCons)}) { - genOMP(converter, symTable, semaCtx, eval, ompNestedLoopCons->value()); + llvm::omp::Directive nestedDirective = + getOpenMPDirectiveEnum(*ompNestedLoopCons); + switch (nestedDirective) { + case llvm::omp::Directive::OMPD_tile: + // Emit the omp.loop_nest with annotation for tiling + genOMP(converter, symTable, semaCtx, eval, ompNestedLoopCons->value()); + break; + default: { + unsigned version = semaCtx.langOptions().OpenMPVersion; + TODO(currentLocation, + "Applying a loop-associated on the loop generated by the " + + llvm::omp::getOpenMPDirectiveName(nestedDirective, version) + + " construct"); + } + } } } - llvm::omp::Directive directive = - std::get(beginLoopDirective.t).v; + llvm::omp::Directive directive = getOpenMPDirectiveEnum(beginLoopDirective); const parser::CharBlock &source = 
std::get(beginLoopDirective.t).source; ConstructQueue queue{ @@ -3842,8 +3952,8 @@ mlir::Operation *Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder, mlir::Location loc) { if (mlir::isa(op)) - return builder.create(loc); - return builder.create(loc); + return mlir::omp::YieldOp::create(builder, loc); + return mlir::omp::TerminatorOp::create(builder, loc); } void Fortran::lower::genOpenMPConstruct(lower::AbstractConverter &converter, @@ -3903,9 +4013,8 @@ void Fortran::lower::genThreadprivateOp(lower::AbstractConverter &converter, return; } // Generate ThreadprivateOp and rebind the common block. - mlir::Value commonThreadprivateValue = - firOpBuilder.create( - currentLocation, commonValue.getType(), commonValue); + mlir::Value commonThreadprivateValue = mlir::omp::ThreadprivateOp::create( + firOpBuilder, currentLocation, commonValue.getType(), commonValue); converter.bindSymbol(*common, commonThreadprivateValue); // Generate the threadprivate value for the common block member. symThreadprivateValue = genCommonBlockMember(converter, currentLocation, @@ -3925,10 +4034,10 @@ void Fortran::lower::genThreadprivateOp(lower::AbstractConverter &converter, global = globalInitialization(converter, firOpBuilder, sym, var, currentLocation); - mlir::Value symValue = firOpBuilder.create( - currentLocation, global.resultType(), global.getSymbol()); - symThreadprivateValue = firOpBuilder.create( - currentLocation, symValue.getType(), symValue); + mlir::Value symValue = fir::AddrOfOp::create( + firOpBuilder, currentLocation, global.resultType(), global.getSymbol()); + symThreadprivateValue = mlir::omp::ThreadprivateOp::create( + firOpBuilder, currentLocation, symValue.getType(), symValue); } else { mlir::Value symValue = converter.getSymbolAddress(sym); @@ -3943,8 +4052,8 @@ void Fortran::lower::genThreadprivateOp(lower::AbstractConverter &converter, if (mlir::isa(op)) return; - symThreadprivateValue = firOpBuilder.create( - currentLocation, symValue.getType(), symValue); + symThreadprivateValue = mlir::omp::ThreadprivateOp::create( + firOpBuilder, currentLocation, symValue.getType(), symValue); } fir::ExtendedValue sexv = converter.getSymbolExtendedValue(sym); diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 2e53f01f1da6a..b1716d6afb200 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -115,7 +115,7 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, bool partialMap, mlir::FlatSymbolRefAttr mapperId) { if (auto boxTy = llvm::dyn_cast(baseAddr.getType())) { - baseAddr = builder.create(loc, baseAddr); + baseAddr = fir::BoxAddrOp::create(builder, loc, baseAddr); retTy = baseAddr.getType(); } @@ -129,8 +129,8 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, if (seqType.hasDynamicExtents()) varType = mlir::TypeAttr::get(seqType.getEleTy()); - mlir::omp::MapInfoOp op = builder.create( - loc, retTy, baseAddr, varType, + mlir::omp::MapInfoOp op = mlir::omp::MapInfoOp::create( + builder, loc, retTy, baseAddr, varType, builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), builder.getAttr(mapCaptureType), varPtrPtr, members, membersIndex, bounds, mapperId, @@ -195,8 +195,8 @@ static void generateArrayIndices(lower::AbstractConverter &converter, clauseLocation, firOpBuilder.getIndexType(), 1); subscript = firOpBuilder.createConvert( clauseLocation, firOpBuilder.getIndexType(), subscript); - 
indices.push_back(firOpBuilder.create(clauseLocation, - subscript, one)); + indices.push_back(mlir::arith::SubIOp::create(firOpBuilder, clauseLocation, + subscript, one)); } } @@ -329,9 +329,10 @@ mlir::Value createParentSymAndGenIntermediateMaps( subscriptIndices, objectList[i]); assert(!subscriptIndices.empty() && "missing expected indices for map clause"); - curValue = firOpBuilder.create( - clauseLocation, firOpBuilder.getRefType(arrType.getEleTy()), - curValue, subscriptIndices); + curValue = fir::CoordinateOp::create( + firOpBuilder, clauseLocation, + firOpBuilder.getRefType(arrType.getEleTy()), curValue, + subscriptIndices); } } @@ -345,9 +346,9 @@ mlir::Value createParentSymAndGenIntermediateMaps( fir::IntOrValue idxConst = mlir::IntegerAttr::get( firOpBuilder.getI32Type(), indices[currentIndicesIdx]); mlir::Type memberTy = recordType.getType(indices[currentIndicesIdx]); - curValue = firOpBuilder.create( - clauseLocation, firOpBuilder.getRefType(memberTy), curValue, - llvm::SmallVector{idxConst}); + curValue = fir::CoordinateOp::create( + firOpBuilder, clauseLocation, firOpBuilder.getRefType(memberTy), + curValue, llvm::SmallVector{idxConst}); // If we're a final member, the map will be generated by the processMap // call that invoked this function. @@ -417,7 +418,7 @@ mlir::Value createParentSymAndGenIntermediateMaps( // Load the currently accessed member, so we can continue to access // further segments. - curValue = firOpBuilder.create(clauseLocation, curValue); + curValue = fir::LoadOp::create(firOpBuilder, clauseLocation, curValue); currentIndicesIdx++; } } @@ -661,6 +662,90 @@ bool collectLoopRelatedInfo( return found; } + +/// Get the directive enumeration value corresponding to the given OpenMP +/// construct PFT node. +llvm::omp::Directive +extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) { + return common::visit( + common::visitors{ + [](const parser::OpenMPAllocatorsConstruct &c) { + return llvm::omp::OMPD_allocators; + }, + [](const parser::OpenMPAssumeConstruct &c) { + return llvm::omp::OMPD_assume; + }, + [](const parser::OpenMPAtomicConstruct &c) { + return llvm::omp::OMPD_atomic; + }, + [](const parser::OpenMPBlockConstruct &c) { + return std::get( + std::get(c.t).t) + .v; + }, + [](const parser::OpenMPCriticalConstruct &c) { + return llvm::omp::OMPD_critical; + }, + [](const parser::OpenMPDeclarativeAllocate &c) { + return llvm::omp::OMPD_allocate; + }, + [](const parser::OpenMPDispatchConstruct &c) { + return llvm::omp::OMPD_dispatch; + }, + [](const parser::OpenMPExecutableAllocate &c) { + return llvm::omp::OMPD_allocate; + }, + [](const parser::OpenMPLoopConstruct &c) { + return std::get( + std::get(c.t).t) + .v; + }, + [](const parser::OpenMPSectionConstruct &c) { + return llvm::omp::OMPD_section; + }, + [](const parser::OpenMPSectionsConstruct &c) { + return std::get( + std::get(c.t).t) + .v; + }, + [](const parser::OpenMPStandaloneConstruct &c) { + return common::visit( + common::visitors{ + [](const parser::OpenMPSimpleStandaloneConstruct &c) { + return c.v.DirId(); + }, + [](const parser::OpenMPFlushConstruct &c) { + return llvm::omp::OMPD_flush; + }, + [](const parser::OpenMPCancelConstruct &c) { + return llvm::omp::OMPD_cancel; + }, + [](const parser::OpenMPCancellationPointConstruct &c) { + return llvm::omp::OMPD_cancellation_point; + }, + [](const parser::OmpMetadirectiveDirective &c) { + return llvm::omp::OMPD_metadirective; + }, + [](const parser::OpenMPDepobjConstruct &c) { + return llvm::omp::OMPD_depobj; + }, + [](const 
parser::OpenMPInteropConstruct &c) { + return llvm::omp::OMPD_interop; + }}, + c.u); + }, + [](const parser::OpenMPUtilityConstruct &c) { + return common::visit( + common::visitors{[](const parser::OmpErrorDirective &c) { + return llvm::omp::OMPD_error; + }, + [](const parser::OmpNothingDirective &c) { + return llvm::omp::OMPD_nothing; + }}, + c.u); + }}, + ompConstruct.u); +} } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h index 1526bd4e90233..8e3ad5c3452e2 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -166,6 +166,9 @@ bool collectLoopRelatedInfo( lower::pft::Evaluation &eval, const omp::List &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv); + +llvm::omp::Directive +extractOmpDirective(const parser::OpenMPConstruct &ompConstruct); } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp index 2be5ef76e46b8..fc59a2414d539 100644 --- a/flang/lib/Lower/Runtime.cpp +++ b/flang/lib/Lower/Runtime.cpp @@ -43,7 +43,7 @@ static void genUnreachable(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::acc::OpenACCDialect::getDialectNamespace()) Fortran::lower::genOpenACCTerminator(builder, parentOp, loc); else - builder.create(loc); + fir::UnreachableOp::create(builder, loc); mlir::Block *newBlock = curBlock->splitBlock(builder.getInsertionPoint()); builder.setInsertionPointToStart(newBlock); } @@ -118,7 +118,7 @@ void Fortran::lower::genStopStatement( loc, calleeType.getInput(operands.size()), 0)); } - builder.create(loc, callee, operands); + fir::CallOp::create(builder, loc, callee, operands); auto blockIsUnterminated = [&builder]() { mlir::Block *currentBlock = builder.getBlock(); return currentBlock->empty() || @@ -134,7 +134,7 @@ void Fortran::lower::genFailImageStatement( mlir::Location loc = converter.getCurrentLocation(); mlir::func::FuncOp callee = fir::runtime::getRuntimeFunc(loc, builder); - builder.create(loc, callee, std::nullopt); + fir::CallOp::create(builder, loc, callee, mlir::ValueRange{}); genUnreachable(builder, loc); } @@ -199,7 +199,7 @@ void Fortran::lower::genPauseStatement( mlir::Location loc = converter.getCurrentLocation(); mlir::func::FuncOp callee = fir::runtime::getRuntimeFunc(loc, builder); - builder.create(loc, callee, std::nullopt); + fir::CallOp::create(builder, loc, callee, mlir::ValueRange{}); } void Fortran::lower::genPointerAssociate(fir::FirOpBuilder &builder, @@ -210,17 +210,18 @@ void Fortran::lower::genPointerAssociate(fir::FirOpBuilder &builder, fir::runtime::getRuntimeFunc(loc, builder); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, func.getFunctionType(), pointer, target); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } -void Fortran::lower::genPointerAssociateRemapping(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value pointer, - mlir::Value target, - mlir::Value bounds) { +void Fortran::lower::genPointerAssociateRemapping( + fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value pointer, + mlir::Value target, mlir::Value bounds, bool isMonomorphic) { mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc(loc, - builder); + isMonomorphic + ? 
fir::runtime::getRuntimeFunc(loc, builder) + : fir::runtime::getRuntimeFunc( + loc, builder); auto fTy = func.getFunctionType(); auto sourceFile = fir::factory::locationToFilename(builder, loc); auto sourceLine = @@ -228,7 +229,7 @@ void Fortran::lower::genPointerAssociateRemapping(fir::FirOpBuilder &builder, llvm::SmallVector args = fir::runtime::createArguments( builder, loc, func.getFunctionType(), pointer, target, bounds, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void Fortran::lower::genPointerAssociateLowerBounds(fir::FirOpBuilder &builder, @@ -241,5 +242,5 @@ void Fortran::lower::genPointerAssociateLowerBounds(fir::FirOpBuilder &builder, loc, builder); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, func.getFunctionType(), pointer, target, lbounds); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp index c3a5b6101ce00..fff060b79c9fe 100644 --- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp +++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp @@ -75,9 +75,9 @@ static void createCleanupRegion(Fortran::lower::AbstractConverter &converter, /*mutableProperties=*/{}}; Fortran::lower::genDeallocateIfAllocated(converter, mutableBox, loc); if (isDoConcurrent) - builder.create(loc); + fir::YieldOp::create(builder, loc); else - builder.create(loc); + mlir::omp::YieldOp::create(builder, loc); return; } } @@ -97,18 +97,18 @@ static void createCleanupRegion(Fortran::lower::AbstractConverter &converter, hlfir::genVariableRawAddress(loc, builder, hlfir::Entity{arg}); mlir::Value isAllocated = builder.genIsNotNullAddr(loc, addr); fir::IfOp ifOp = - builder.create(loc, isAllocated, /*withElseRegion=*/false); + fir::IfOp::create(builder, loc, isAllocated, /*withElseRegion=*/false); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); mlir::Value cast = builder.createConvert( loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr); - builder.create(loc, cast); + fir::FreeMemOp::create(builder, loc, cast); builder.setInsertionPointAfter(ifOp); if (isDoConcurrent) - builder.create(loc); + fir::YieldOp::create(builder, loc); else - builder.create(loc); + mlir::omp::YieldOp::create(builder, loc); return; } @@ -122,11 +122,11 @@ static void createCleanupRegion(Fortran::lower::AbstractConverter &converter, auto heapTy = fir::HeapType::get(refTy.getEleTy()); addr = builder.createConvert(loc, heapTy, addr); - builder.create(loc, addr); + fir::FreeMemOp::create(builder, loc, addr); if (isDoConcurrent) - builder.create(loc); + fir::YieldOp::create(builder, loc); else - builder.create(loc); + mlir::omp::YieldOp::create(builder, loc); return; } @@ -172,7 +172,7 @@ fir::ShapeShiftOp Fortran::lower::getShapeShift( // OpenACC does mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); auto dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, box, dim); + fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, box, dim); lbAndExtents.push_back(useDefaultLowerBounds ? 
one() : dimInfo.getLowerBound()); lbAndExtents.push_back(dimInfo.getExtent()); @@ -181,7 +181,7 @@ fir::ShapeShiftOp Fortran::lower::getShapeShift( auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank); auto shapeShift = - builder.create(loc, shapeShiftTy, lbAndExtents); + fir::ShapeShiftOp::create(builder, loc, shapeShiftTy, lbAndExtents); return shapeShift; } @@ -270,7 +270,7 @@ static mlir::Value generateZeroShapeForRank(fir::FirOpBuilder &builder, mlir::SmallVector dims; dims.resize(rank, zero); mlir::Type shapeTy = fir::ShapeType::get(builder.getContext(), rank); - return builder.create(loc, shapeTy, dims); + return fir::ShapeOp::create(builder, loc, shapeTy, dims); } namespace { @@ -341,9 +341,9 @@ class PopulateInitAndCleanupRegionsHelper { void createYield(mlir::Value ret) { if (isDoConcurrent) - builder.create(loc, ret); + fir::YieldOp::create(builder, loc, ret); else - builder.create(loc, ret); + mlir::omp::YieldOp::create(builder, loc, ret); } void initTrivialType() { @@ -392,9 +392,9 @@ void PopulateInitAndCleanupRegionsHelper::initBoxedPrivatePointer( // Just incase, do initialize the box with a null value mlir::Value null = builder.createNullConstant(loc, boxTy.getEleTy()); mlir::Value nullBox; - nullBox = builder.create(loc, boxTy, null, shape, - /*slice=*/mlir::Value{}, lenParams); - builder.create(loc, nullBox, allocatedPrivVarArg); + nullBox = fir::EmboxOp::create(builder, loc, boxTy, null, shape, + /*slice=*/mlir::Value{}, lenParams); + fir::StoreOp::create(builder, loc, nullBox, allocatedPrivVarArg); createYield(allocatedPrivVarArg); } /// Check if an allocatable box is unallocated. If so, initialize the boxAlloca @@ -410,10 +410,10 @@ void PopulateInitAndCleanupRegionsHelper::initBoxedPrivatePointer( /// } /// omp.yield %box_alloca fir::IfOp PopulateInitAndCleanupRegionsHelper::handleNullAllocatable() { - mlir::Value addr = builder.create(loc, getLoadedMoldArg()); + mlir::Value addr = fir::BoxAddrOp::create(builder, loc, getLoadedMoldArg()); mlir::Value isNotAllocated = builder.genIsNullAddr(loc, addr); - fir::IfOp ifOp = builder.create(loc, isNotAllocated, - /*withElseRegion=*/true); + fir::IfOp ifOp = fir::IfOp::create(builder, loc, isNotAllocated, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); // Just embox the null address and return. 
// We have to give the embox a shape so that the LLVM box structure has the @@ -421,9 +421,9 @@ fir::IfOp PopulateInitAndCleanupRegionsHelper::handleNullAllocatable() { mlir::Value shape = generateZeroShapeForRank(builder, loc, moldArg); mlir::Value nullBox = - builder.create(loc, valType, addr, shape, - /*slice=*/mlir::Value{}, lenParams); - builder.create(loc, nullBox, allocatedPrivVarArg); + fir::EmboxOp::create(builder, loc, valType, addr, shape, + /*slice=*/mlir::Value{}, lenParams); + fir::StoreOp::create(builder, loc, nullBox, allocatedPrivVarArg); return ifOp; } @@ -442,14 +442,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar( /*shape=*/{}, lenParams); if (scalarInitValue) builder.createStoreWithConvert(loc, scalarInitValue, valAlloc); - mlir::Value box = builder.create( - loc, valType, valAlloc, /*shape=*/mlir::Value{}, - /*slice=*/mlir::Value{}, lenParams); + mlir::Value box = fir::EmboxOp::create(builder, loc, valType, valAlloc, + /*shape=*/mlir::Value{}, + /*slice=*/mlir::Value{}, lenParams); initializeIfDerivedTypeBox( builder, loc, box, getLoadedMoldArg(), needsInitialization, /*isFirstPrivate=*/kind == DeclOperationKind::FirstPrivateOrLocalInit); fir::StoreOp lastOp = - builder.create(loc, box, allocatedPrivVarArg); + fir::StoreOp::create(builder, loc, box, allocatedPrivVarArg); createCleanupRegion(converter, loc, argType, cleanupRegion, sym, isDoConcurrent); @@ -483,14 +483,15 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray( fir::ShapeShiftOp shape = getShapeShift(builder, loc, source, cannotHaveNonDefaultLowerBounds); mlir::Type arrayType = source.getElementOrSequenceType(); - mlir::Value allocatedArray = builder.create( - loc, arrayType, /*typeparams=*/mlir::ValueRange{}, shape.getExtents()); - mlir::Value firClass = builder.create(loc, source.getType(), - allocatedArray, shape); + mlir::Value allocatedArray = fir::AllocMemOp::create( + builder, loc, arrayType, /*typeparams=*/mlir::ValueRange{}, + shape.getExtents()); + mlir::Value firClass = fir::EmboxOp::create(builder, loc, source.getType(), + allocatedArray, shape); initializeIfDerivedTypeBox( builder, loc, firClass, source, needsInitialization, /*isFirstprivate=*/kind == DeclOperationKind::FirstPrivateOrLocalInit); - builder.create(loc, firClass, allocatedPrivVarArg); + fir::StoreOp::create(builder, loc, firClass, allocatedPrivVarArg); if (ifUnallocated) builder.setInsertionPointAfter(ifUnallocated); createYield(allocatedPrivVarArg); @@ -543,22 +544,21 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray( if (mlir::isa(temp.getType())) // the box created by the declare form createTempFromMold is missing // lower bounds info - box = builder.create(loc, boxType, temp, shapeShift, - /*shift=*/mlir::Value{}); + box = fir::ReboxOp::create(builder, loc, boxType, temp, shapeShift, + /*shift=*/mlir::Value{}); else - box = builder.create( - loc, boxType, temp, shapeShift, - /*slice=*/mlir::Value{}, - /*typeParams=*/llvm::ArrayRef{}); + box = fir::EmboxOp::create(builder, loc, boxType, temp, shapeShift, + /*slice=*/mlir::Value{}, + /*typeParams=*/llvm::ArrayRef{}); if (scalarInitValue) - builder.create(loc, scalarInitValue, box); + hlfir::AssignOp::create(builder, loc, scalarInitValue, box); initializeIfDerivedTypeBox( builder, loc, box, getLoadedMoldArg(), needsInitialization, /*isFirstPrivate=*/kind == DeclOperationKind::FirstPrivateOrLocalInit); - builder.create(loc, box, allocatedPrivVarArg); + fir::StoreOp::create(builder, loc, box, allocatedPrivVarArg); if 
(ifUnallocated) builder.setInsertionPointAfter(ifUnallocated); createYield(allocatedPrivVarArg); @@ -596,8 +596,8 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupUnboxedDerivedType( builder.setInsertionPointToStart(initBlock); mlir::Type boxedTy = fir::BoxType::get(valType); mlir::Value newBox = - builder.create(loc, boxedTy, allocatedPrivVarArg); - mlir::Value moldBox = builder.create(loc, boxedTy, moldArg); + fir::EmboxOp::create(builder, loc, boxedTy, allocatedPrivVarArg); + mlir::Value moldBox = fir::EmboxOp::create(builder, loc, boxedTy, moldArg); initializeIfDerivedTypeBox(builder, loc, newBox, moldBox, needsInitialization, /*isFirstPrivate=*/kind == DeclOperationKind::FirstPrivateOrLocalInit); diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp index c0be1e229f825..80c32d066a38d 100644 --- a/flang/lib/Lower/Support/ReductionProcessor.cpp +++ b/flang/lib/Lower/Support/ReductionProcessor.cpp @@ -260,20 +260,20 @@ ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type, initIm); } if (mlir::isa(type)) - return builder.create( - loc, type, + return mlir::arith::ConstantOp::create( + builder, loc, type, builder.getFloatAttr(type, (double)getOperationIdentity(redId, loc))); if (mlir::isa(type)) { - mlir::Value intConst = builder.create( - loc, builder.getI1Type(), + mlir::Value intConst = mlir::arith::ConstantOp::create( + builder, loc, builder.getI1Type(), builder.getIntegerAttr(builder.getI1Type(), getOperationIdentity(redId, loc))); return builder.createConvert(loc, type, intConst); } - return builder.create( - loc, type, + return mlir::arith::ConstantOp::create( + builder, loc, type, builder.getIntegerAttr(type, getOperationIdentity(redId, loc))); case ReductionIdentifier::ID: case ReductionIdentifier::USER_DEF_OP: @@ -301,15 +301,15 @@ mlir::Value ReductionProcessor::createScalarCombiner( break; case ReductionIdentifier::IOR: assert((type.isIntOrIndex()) && "only integer is expected"); - reductionOp = builder.create(loc, op1, op2); + reductionOp = mlir::arith::OrIOp::create(builder, loc, op1, op2); break; case ReductionIdentifier::IEOR: assert((type.isIntOrIndex()) && "only integer is expected"); - reductionOp = builder.create(loc, op1, op2); + reductionOp = mlir::arith::XOrIOp::create(builder, loc, op1, op2); break; case ReductionIdentifier::IAND: assert((type.isIntOrIndex()) && "only integer is expected"); - reductionOp = builder.create(loc, op1, op2); + reductionOp = mlir::arith::AndIOp::create(builder, loc, op1, op2); break; case ReductionIdentifier::ADD: reductionOp = @@ -325,7 +325,8 @@ mlir::Value ReductionProcessor::createScalarCombiner( mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - mlir::Value andiOp = builder.create(loc, op1I1, op2I1); + mlir::Value andiOp = + mlir::arith::AndIOp::create(builder, loc, op1I1, op2I1); reductionOp = builder.createConvert(loc, type, andiOp); break; @@ -334,7 +335,7 @@ mlir::Value ReductionProcessor::createScalarCombiner( mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - mlir::Value oriOp = builder.create(loc, op1I1, op2I1); + mlir::Value oriOp = mlir::arith::OrIOp::create(builder, loc, op1I1, op2I1); reductionOp = builder.createConvert(loc, type, oriOp); break; @@ -343,8 +344,8 @@ mlir::Value ReductionProcessor::createScalarCombiner( mlir::Value op1I1 = 
builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - mlir::Value cmpiOp = builder.create( - loc, mlir::arith::CmpIPredicate::eq, op1I1, op2I1); + mlir::Value cmpiOp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, op1I1, op2I1); reductionOp = builder.createConvert(loc, type, cmpiOp); break; @@ -353,8 +354,8 @@ mlir::Value ReductionProcessor::createScalarCombiner( mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - mlir::Value cmpiOp = builder.create( - loc, mlir::arith::CmpIPredicate::ne, op1I1, op2I1); + mlir::Value cmpiOp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, op1I1, op2I1); reductionOp = builder.createConvert(loc, type, cmpiOp); break; @@ -370,9 +371,9 @@ template static void genYield(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value yieldedValue) { if constexpr (std::is_same_v) - builder.create(loc, yieldedValue); + mlir::omp::YieldOp::create(builder, loc, yieldedValue); else - builder.create(loc, yieldedValue); + fir::YieldOp::create(builder, loc, yieldedValue); } /// Create reduction combiner region for reduction variables which are boxed @@ -393,24 +394,24 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, // load fir.ref> mlir::Value lhsAddr = lhs; - lhs = builder.create(loc, lhs); - rhs = builder.create(loc, rhs); + lhs = fir::LoadOp::create(builder, loc, lhs); + rhs = fir::LoadOp::create(builder, loc, rhs); if ((heapTy || ptrTy) && !seqTy) { // get box contents (heap pointers) - lhs = builder.create(loc, lhs); - rhs = builder.create(loc, rhs); + lhs = fir::BoxAddrOp::create(builder, loc, lhs); + rhs = fir::BoxAddrOp::create(builder, loc, rhs); mlir::Value lhsValAddr = lhs; // load heap pointers - lhs = builder.create(loc, lhs); - rhs = builder.create(loc, rhs); + lhs = fir::LoadOp::create(builder, loc, lhs); + rhs = fir::LoadOp::create(builder, loc, rhs); mlir::Type eleTy = heapTy ? 
heapTy.getEleTy() : ptrTy.getEleTy(); mlir::Value result = ReductionProcessor::createScalarCombiner( builder, loc, redId, eleTy, lhs, rhs); - builder.create(loc, result, lhsValAddr); + fir::StoreOp::create(builder, loc, result, lhsValAddr); genYield(builder, loc, lhsAddr); return; } @@ -437,17 +438,17 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, builder.setInsertionPointToStart(nest.body); const bool seqIsVolatile = fir::isa_volatile_type(seqTy.getEleTy()); mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy(), seqIsVolatile); - auto lhsEleAddr = builder.create( - loc, refTy, lhs, shapeShift, /*slice=*/mlir::Value{}, + auto lhsEleAddr = fir::ArrayCoorOp::create( + builder, loc, refTy, lhs, shapeShift, /*slice=*/mlir::Value{}, nest.oneBasedIndices, /*typeparms=*/mlir::ValueRange{}); - auto rhsEleAddr = builder.create( - loc, refTy, rhs, shapeShift, /*slice=*/mlir::Value{}, + auto rhsEleAddr = fir::ArrayCoorOp::create( + builder, loc, refTy, rhs, shapeShift, /*slice=*/mlir::Value{}, nest.oneBasedIndices, /*typeparms=*/mlir::ValueRange{}); - auto lhsEle = builder.create(loc, lhsEleAddr); - auto rhsEle = builder.create(loc, rhsEleAddr); + auto lhsEle = fir::LoadOp::create(builder, loc, lhsEleAddr); + auto rhsEle = fir::LoadOp::create(builder, loc, rhsEleAddr); mlir::Value scalarReduction = ReductionProcessor::createScalarCombiner( builder, loc, redId, refTy, lhsEle, rhsEle); - builder.create(loc, scalarReduction, lhsEleAddr); + fir::StoreOp::create(builder, loc, scalarReduction, lhsEleAddr); builder.setInsertionPointAfter(nest.outerOp); genYield(builder, loc, lhsAddr); @@ -468,7 +469,7 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value result = ReductionProcessor::createScalarCombiner( builder, loc, redId, ty, lhsLoaded, rhsLoaded); if (isByRef) { - builder.create(loc, result, lhs); + fir::StoreOp::create(builder, loc, result, lhs); genYield(builder, loc, lhs); } else { genYield(builder, loc, result); @@ -539,7 +540,7 @@ static void createReductionAllocAndInitRegions( if (isByRef) { // alloc region builder.setInsertionPointToEnd(allocBlock); - mlir::Value alloca = builder.create(loc, ty); + mlir::Value alloca = fir::AllocaOp::create(builder, loc, ty); yield(alloca); return; } @@ -551,7 +552,7 @@ static void createReductionAllocAndInitRegions( // alloc region builder.setInsertionPointToEnd(allocBlock); - mlir::Value boxAlloca = builder.create(loc, ty); + mlir::Value boxAlloca = fir::AllocaOp::create(builder, loc, ty); yield(boxAlloca); } @@ -575,7 +576,7 @@ OpType ReductionProcessor::createDeclareReduction( if (!isByRef) type = valTy; - decl = modBuilder.create(loc, reductionOpName, type); + decl = OpType::create(modBuilder, loc, reductionOpName, type); createReductionAllocAndInitRegions(converter, loc, decl, redId, type, isByRef); @@ -672,8 +673,8 @@ void ReductionProcessor::processReductionArguments( // Always pass the box by reference so that the OpenMP dialect // verifiers don't need to know anything about fir.box auto alloca = - builder.create(currentLocation, box.getType()); - builder.create(currentLocation, box, alloca); + fir::AllocaOp::create(builder, currentLocation, box.getType()); + fir::StoreOp::create(builder, currentLocation, box, alloca); symVal = alloca; } else if (mlir::isa(symVal.getType())) { @@ -683,9 +684,9 @@ void ReductionProcessor::processReductionArguments( auto oldIP = builder.saveInsertionPoint(); builder.setInsertionPointToStart(builder.getAllocaBlock()); auto alloca = - 
builder.create(currentLocation, symVal.getType()); + fir::AllocaOp::create(builder, currentLocation, symVal.getType()); builder.restoreInsertionPoint(oldIP); - builder.create(currentLocation, symVal, alloca); + fir::StoreOp::create(builder, currentLocation, symVal, alloca); symVal = alloca; } diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp index b9d2574a76ad0..881401e11fee4 100644 --- a/flang/lib/Lower/Support/Utils.cpp +++ b/flang/lib/Lower/Support/Utils.cpp @@ -702,9 +702,10 @@ void privatizeSymbol( // Boxes should be passed by reference into nested regions: auto oldIP = firOpBuilder.saveInsertionPoint(); firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock()); - auto alloca = firOpBuilder.create(symLoc, privVal.getType()); + auto alloca = + fir::AllocaOp::create(firOpBuilder, symLoc, privVal.getType()); firOpBuilder.restoreInsertionPoint(oldIP); - firOpBuilder.create(symLoc, privVal, alloca); + fir::StoreOp::create(firOpBuilder, symLoc, privVal, alloca); privVal = alloca; } @@ -726,15 +727,15 @@ void privatizeSymbol( OpType result; if constexpr (std::is_same_v) { - result = firOpBuilder.create( - symLoc, uniquePrivatizerName, allocType, + result = OpType::create( + firOpBuilder, symLoc, uniquePrivatizerName, allocType, emitCopyRegion ? mlir::omp::DataSharingClauseType::FirstPrivate : mlir::omp::DataSharingClauseType::Private); } else { - result = firOpBuilder.create( - symLoc, uniquePrivatizerName, allocType, - emitCopyRegion ? fir::LocalitySpecifierType::LocalInit - : fir::LocalitySpecifierType::Local); + result = + OpType::create(firOpBuilder, symLoc, uniquePrivatizerName, allocType, + emitCopyRegion ? fir::LocalitySpecifierType::LocalInit + : fir::LocalitySpecifierType::Local); } fir::ExtendedValue symExV = converter.getSymbolExtendedValue(*sym); @@ -815,12 +816,12 @@ void privatizeSymbol( copyFirstPrivateSymbol(converter, symToPrivatize, &ip); if constexpr (std::is_same_v) { - firOpBuilder.create( - hsb.getAddr().getLoc(), + mlir::omp::YieldOp::create( + firOpBuilder, hsb.getAddr().getLoc(), symTable.shallowLookupSymbol(*symToPrivatize).getAddr()); } else { - firOpBuilder.create( - hsb.getAddr().getLoc(), + fir::YieldOp::create( + firOpBuilder, hsb.getAddr().getLoc(), symTable.shallowLookupSymbol(*symToPrivatize).getAddr()); } } diff --git a/flang/lib/Lower/VectorSubscripts.cpp b/flang/lib/Lower/VectorSubscripts.cpp index 389a89ddcf102..4d1d6fb28b365 100644 --- a/flang/lib/Lower/VectorSubscripts.cpp +++ b/flang/lib/Lower/VectorSubscripts.cpp @@ -121,8 +121,9 @@ class VectorSubscriptBoxBuilder { if (recTy.getNumLenParams() != 0) TODO(loc, "threading length parameters in field index op"); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - componentPath.emplace_back(builder.create( - loc, fldTy, componentName, recTy, /*typeParams*/ std::nullopt)); + componentPath.emplace_back( + fir::FieldIndexOp::create(builder, loc, fldTy, componentName, recTy, + /*typeParams=*/mlir::ValueRange{})); return fir::unwrapSequenceType(recTy.getType(componentName)); } @@ -269,16 +270,16 @@ mlir::Value Fortran::lower::VectorSubscriptBox::loopOverElementsBase( for (auto [lb, ub, step] : genLoopBounds(builder, loc)) { LoopType loop; if constexpr (std::is_same_v) { - loop = - builder.create(loc, lb, ub, step, initialCondition); + loop = fir::IterWhileOp::create(builder, loc, lb, ub, step, + initialCondition); initialCondition = loop.getIterateVar(); if (!outerLoop) outerLoop = loop; else - builder.create(loc, loop.getResult(0)); + 
fir::ResultOp::create(builder, loc, loop.getResult(0)); } else { - loop = - builder.create(loc, lb, ub, step, /*unordered=*/false); + loop = fir::DoLoopOp::create(builder, loc, lb, ub, step, + /*unordered=*/false); if (!outerLoop) outerLoop = loop; } @@ -293,7 +294,7 @@ mlir::Value Fortran::lower::VectorSubscriptBox::loopOverElementsBase( if constexpr (std::is_same_v) { auto res = elementalGenerator(elem); - builder.create(loc, res); + fir::ResultOp::create(builder, loc, res); builder.setInsertionPointAfter(outerLoop); return outerLoop.getResult(0); } else { @@ -326,7 +327,7 @@ Fortran::lower::VectorSubscriptBox::createSlice(fir::FirOpBuilder &builder, mlir::Type idxTy = builder.getIndexType(); llvm::SmallVector triples; mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - auto undef = builder.create(loc, idxTy); + auto undef = fir::UndefOp::create(builder, loc, idxTy); for (const LoweredSubscript &subscript : loweredSubscripts) Fortran::common::visit(Fortran::common::visitors{ [&](const LoweredTriplet &triplet) { @@ -346,7 +347,7 @@ Fortran::lower::VectorSubscriptBox::createSlice(fir::FirOpBuilder &builder, }, }, subscript); - return builder.create(loc, triples, componentPath); + return fir::SliceOp::create(builder, loc, triples, componentPath); } llvm::SmallVector> @@ -369,13 +370,13 @@ Fortran::lower::VectorSubscriptBox::genLoopBounds(fir::FirOpBuilder &builder, builder, loc, loweredBase, dimension, one); baseLb = builder.createConvert(loc, idxTy, baseLb); lb = baseLb; - ub = builder.create(loc, idxTy, extent, one); - ub = builder.create(loc, idxTy, ub, baseLb); + ub = mlir::arith::SubIOp::create(builder, loc, idxTy, extent, one); + ub = mlir::arith::AddIOp::create(builder, loc, idxTy, ub, baseLb); step = one; } else { const auto &vector = std::get(subscript); lb = zero; - ub = builder.create(loc, idxTy, vector.size, one); + ub = mlir::arith::SubIOp::create(builder, loc, idxTy, vector.size, one); step = one; } bounds.emplace_back(lb, ub, step); @@ -402,10 +403,10 @@ fir::ExtendedValue Fortran::lower::VectorSubscriptBox::getElementAt( mlir::Type vecEleTy = fir::unwrapSequenceType( fir::unwrapPassByRefType(vecBase.getType())); mlir::Type refTy = builder.getRefType(vecEleTy); - auto vecEltRef = builder.create( - loc, refTy, vecBase, vecIndex); + auto vecEltRef = fir::CoordinateOp::create(builder, loc, refTy, + vecBase, vecIndex); auto vecElt = - builder.create(loc, vecEleTy, vecEltRef); + fir::LoadOp::create(builder, loc, vecEleTy, vecEltRef); indexes.emplace_back(builder.createConvert(loc, idxTy, vecElt)); }, [&](const mlir::Value &i) { @@ -414,8 +415,8 @@ fir::ExtendedValue Fortran::lower::VectorSubscriptBox::getElementAt( }, subscript); mlir::Type refTy = builder.getRefType(getElementType()); - auto elementAddr = builder.create( - loc, refTy, fir::getBase(loweredBase), shape, slice, indexes, + auto elementAddr = fir::ArrayCoorOp::create( + builder, loc, refTy, fir::getBase(loweredBase), shape, slice, indexes, fir::getTypeParams(loweredBase)); fir::ExtendedValue element = fir::factory::arraySectionElementToExtendedValue( builder, loc, loweredBase, elementAddr, slice); diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp index dcbf4991907bf..cf7588f275d22 100644 --- a/flang/lib/Optimizer/Builder/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -25,8 +25,8 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod, mlir::UnitAttr::get(ctx)); mlir::OpBuilder builder(ctx); - auto gpuMod = 
builder.create(mod.getLoc(), - cudaDeviceModuleName); + auto gpuMod = mlir::gpu::GPUModuleOp::create(builder, mod.getLoc(), + cudaDeviceModuleName); mlir::Block::iterator insertPt(mod.getBodyRegion().front().end()); symTab.insert(gpuMod, insertPt); return gpuMod; @@ -84,8 +84,8 @@ void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) { if (auto globalOp = mod.lookupSymbol(addrOfOp.getSymbol())) { if (cuf::isRegisteredDeviceGlobal(globalOp)) { - builder.create(box.getLoc(), - addrOfOp.getSymbol()); + cuf::SyncDescriptorOp::create(builder, box.getLoc(), + addrOfOp.getSymbol()); } } } diff --git a/flang/lib/Optimizer/Builder/Character.cpp b/flang/lib/Optimizer/Builder/Character.cpp index 61428ac490a46..a096099a04fe8 100644 --- a/flang/lib/Optimizer/Builder/Character.cpp +++ b/flang/lib/Optimizer/Builder/Character.cpp @@ -112,8 +112,8 @@ fir::factory::CharacterExprHelper::materializeValue(mlir::Value str) { } auto len = builder.createIntegerConstant( loc, builder.getCharacterLengthType(), charTy.getLen()); - auto temp = builder.create(loc, charTy); - builder.create(loc, str, temp); + auto temp = fir::AllocaOp::create(builder, loc, charTy); + fir::StoreOp::create(builder, loc, str, temp); LLVM_DEBUG(llvm::dbgs() << "materialized as local: " << str << " -> (" << temp << ", " << len << ")\n"); return {temp, len}; @@ -163,7 +163,7 @@ fir::factory::CharacterExprHelper::toExtendedValue(mlir::Value character, } if (!boxCharLen) { auto unboxed = - builder.create(loc, refType, lenType, character); + fir::UnboxCharOp::create(builder, loc, refType, lenType, character); base = builder.createConvert(loc, refType, unboxed.getResult(0)); boxCharLen = unboxed.getResult(1); } @@ -208,7 +208,7 @@ fir::factory::CharacterExprHelper::createEmbox(const fir::CharBoxValue &box) { // not in memory. if (!fir::isa_ref_type(buff.getType())) { auto temp = builder.createTemporary(loc, buff.getType()); - builder.create(loc, buff, temp); + fir::StoreOp::create(builder, loc, buff, temp); buff = temp; } // fir.emboxchar only accepts scalar, cast array buffer to a scalar buffer. @@ -218,7 +218,7 @@ fir::factory::CharacterExprHelper::createEmbox(const fir::CharBoxValue &box) { // be used in boxchar. auto len = builder.createConvert(loc, builder.getCharacterLengthType(), box.getLen()); - return builder.create(loc, boxCharType, buff, len); + return fir::EmboxCharOp::create(builder, loc, boxCharType, buff, len); } fir::CharBoxValue fir::factory::CharacterExprHelper::toScalarCharacter( @@ -231,8 +231,8 @@ fir::CharBoxValue fir::factory::CharacterExprHelper::toScalarCharacter( auto lenType = builder.getCharacterLengthType(); auto len = builder.createConvert(loc, lenType, box.getLen()); for (auto extent : box.getExtents()) - len = builder.create( - loc, len, builder.createConvert(loc, lenType, extent)); + len = mlir::arith::MulIOp::create( + builder, loc, len, builder.createConvert(loc, lenType, extent)); // TODO: typeLen can be improved in compiled constant cases // TODO: allow bare fir.array<> (no ref) conversion here ? @@ -277,7 +277,7 @@ fir::factory::CharacterExprHelper::createElementAddr(mlir::Value buffer, auto coor = builder.createConvert(loc, coorTy, buffer); auto i = builder.createConvert(loc, builder.getIndexType(), index); - return builder.create(loc, singleRefTy, coor, i); + return fir::CoordinateOp::create(builder, loc, singleRefTy, coor, i); } /// Load a character out of `buff` from offset `index`. 
@@ -287,7 +287,7 @@ fir::factory::CharacterExprHelper::createLoadCharAt(mlir::Value buff, mlir::Value index) { LLVM_DEBUG(llvm::dbgs() << "load a char: " << buff << " type: " << buff.getType() << " at: " << index << '\n'); - return builder.create(loc, createElementAddr(buff, index)); + return fir::LoadOp::create(builder, loc, createElementAddr(buff, index)); } /// Store the singleton character `c` to `str` at offset `index`. @@ -299,7 +299,7 @@ void fir::factory::CharacterExprHelper::createStoreCharAt(mlir::Value str, << " type: " << str.getType() << " at: " << index << '\n'); auto addr = createElementAddr(str, index); - builder.create(loc, c, addr); + fir::StoreOp::create(builder, loc, c, addr); } // FIXME: this temp is useless... either fir.coordinate_of needs to @@ -311,8 +311,8 @@ mlir::Value fir::factory::CharacterExprHelper::getCharBoxBuffer( const fir::CharBoxValue &box) { auto buff = box.getBuffer(); if (fir::isa_char(buff.getType())) { - auto newBuff = builder.create(loc, buff.getType()); - builder.create(loc, buff, newBuff); + auto newBuff = fir::AllocaOp::create(builder, loc, buff.getType()); + fir::StoreOp::create(builder, loc, buff, newBuff); return newBuff; } return buff; @@ -339,19 +339,19 @@ void fir::factory::CharacterExprHelper::createCopy( auto kindBytes = builder.createIntegerConstant(loc, i64Ty, bytes); auto castCount = builder.createConvert(loc, i64Ty, count); auto totalBytes = - builder.create(loc, kindBytes, castCount); + mlir::arith::MulIOp::create(builder, loc, kindBytes, castCount); auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); auto toPtr = builder.createConvert(loc, llvmPointerType, toBuff); auto fromPtr = builder.createConvert(loc, llvmPointerType, fromBuff); - builder.create(loc, toPtr, fromPtr, totalBytes, - isVolatile); + mlir::LLVM::MemmoveOp::create(builder, loc, toPtr, fromPtr, totalBytes, + isVolatile); return; } // Convert a CHARACTER of one KIND into a CHARACTER of another KIND. - builder.create(loc, src.getBuffer(), count, - dest.getBuffer()); + fir::CharConvertOp::create(builder, loc, src.getBuffer(), count, + dest.getBuffer()); } void fir::factory::CharacterExprHelper::createPadding( @@ -397,7 +397,7 @@ fir::CharBoxValue fir::factory::CharacterExprHelper::createTempFrom( } else { auto ref = builder.createConvert(loc, builder.getRefType(sourceTy), temp.getBuffer()); - builder.create(loc, charBox->getBuffer(), ref); + fir::StoreOp::create(builder, loc, charBox->getBuffer(), ref); } return temp; } @@ -412,23 +412,23 @@ void fir::factory::CharacterExprHelper::createLengthOneAssign( auto fromCharLen1RefTy = builder.getRefType(getSingletonCharType( builder.getContext(), getCharacterKind(fir::unwrapRefType(val.getType())))); - val = builder.create( - loc, builder.createConvert(loc, fromCharLen1RefTy, val)); + val = fir::LoadOp::create( + builder, loc, builder.createConvert(loc, fromCharLen1RefTy, val)); } auto toCharLen1Ty = getSingletonCharType(builder.getContext(), getCharacterKind(toTy)); val = builder.createConvert(loc, toCharLen1Ty, val); - builder.create( - loc, val, + fir::StoreOp::create( + builder, loc, val, builder.createConvert(loc, builder.getRefType(toCharLen1Ty), addr)); } /// Returns the minimum of integer mlir::Value \p a and \b. 
mlir::Value genMin(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value a, mlir::Value b) { - auto cmp = builder.create( - loc, mlir::arith::CmpIPredicate::slt, a, b); - return builder.create(loc, cmp, a, b); + auto cmp = mlir::arith::CmpIOp::create(builder, loc, + mlir::arith::CmpIPredicate::slt, a, b); + return mlir::arith::SelectOp::create(builder, loc, cmp, a, b); } void fir::factory::CharacterExprHelper::createAssign( @@ -479,7 +479,7 @@ void fir::factory::CharacterExprHelper::createAssign( if (!compileTimeSameLength) { auto one = builder.createIntegerConstant(loc, lhs.getLen().getType(), 1); auto maxPadding = - builder.create(loc, lhs.getLen(), one); + mlir::arith::SubIOp::create(builder, loc, lhs.getLen(), one); createPadding(lhs, copyCount, maxPadding); } } @@ -490,18 +490,19 @@ fir::CharBoxValue fir::factory::CharacterExprHelper::createConcatenate( lhs.getLen()); auto rhsLen = builder.createConvert(loc, builder.getCharacterLengthType(), rhs.getLen()); - mlir::Value len = builder.create(loc, lhsLen, rhsLen); + mlir::Value len = mlir::arith::AddIOp::create(builder, loc, lhsLen, rhsLen); auto temp = createCharacterTemp(getCharacterType(rhs), len); createCopy(temp, lhs, lhsLen); auto one = builder.createIntegerConstant(loc, len.getType(), 1); - auto upperBound = builder.create(loc, len, one); + auto upperBound = mlir::arith::SubIOp::create(builder, loc, len, one); auto lhsLenIdx = builder.createConvert(loc, builder.getIndexType(), lhsLen); auto fromBuff = getCharBoxBuffer(rhs); auto toBuff = getCharBoxBuffer(temp); fir::factory::DoLoopHelper{builder, loc}.createLoop( lhsLenIdx, upperBound, one, [&](fir::FirOpBuilder &bldr, mlir::Value index) { - auto rhsIndex = bldr.create(loc, index, lhsLenIdx); + auto rhsIndex = + mlir::arith::SubIOp::create(bldr, loc, index, lhsLenIdx); auto charVal = createLoadCharAt(fromBuff, rhsIndex); createStoreCharAt(toBuff, index, charVal); }); @@ -514,7 +515,7 @@ mlir::Value fir::factory::CharacterExprHelper::genSubstringBase( if (!one) one = builder.createIntegerConstant(loc, lowerBound.getType(), 1); auto offset = - builder.create(loc, lowerBound, one).getResult(); + mlir::arith::SubIOp::create(builder, loc, lowerBound, one).getResult(); auto addr = createElementAddr(stringRawAddr, offset); return builder.createConvert(loc, substringAddrType, addr); } @@ -545,19 +546,19 @@ fir::CharBoxValue fir::factory::CharacterExprHelper::createSubstring( mlir::Value substringLen; if (nbounds < 2) { substringLen = - builder.create(loc, box.getLen(), castBounds[0]); + mlir::arith::SubIOp::create(builder, loc, box.getLen(), castBounds[0]); } else { substringLen = - builder.create(loc, castBounds[1], castBounds[0]); + mlir::arith::SubIOp::create(builder, loc, castBounds[1], castBounds[0]); } - substringLen = builder.create(loc, substringLen, one); + substringLen = mlir::arith::AddIOp::create(builder, loc, substringLen, one); // Set length to zero if bounds were reversed (Fortran 2018 9.4.1) auto zero = builder.createIntegerConstant(loc, substringLen.getType(), 0); - auto cdt = builder.create( - loc, mlir::arith::CmpIPredicate::slt, substringLen, zero); + auto cdt = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, substringLen, zero); substringLen = - builder.create(loc, cdt, zero, substringLen); + mlir::arith::SelectOp::create(builder, loc, cdt, zero, substringLen); return {substringRef, substringLen}; } @@ -573,11 +574,11 @@ fir::factory::CharacterExprHelper::createLenTrim(const fir::CharBoxValue &str) { auto zero = 
builder.createIntegerConstant(loc, indexType, 0); auto trueVal = builder.createIntegerConstant(loc, builder.getI1Type(), 1); auto blank = createBlankConstantCode(getCharacterType(str)); - mlir::Value lastChar = builder.create(loc, len, one); + mlir::Value lastChar = mlir::arith::SubIOp::create(builder, loc, len, one); auto iterWhile = - builder.create(loc, lastChar, zero, minusOne, trueVal, - /*returnFinalCount=*/false, lastChar); + fir::IterWhileOp::create(builder, loc, lastChar, zero, minusOne, trueVal, + /*returnFinalCount=*/false, lastChar); auto insPt = builder.saveInsertionPoint(); builder.setInsertionPointToStart(iterWhile.getBody()); auto index = iterWhile.getInductionVar(); @@ -586,17 +587,17 @@ fir::factory::CharacterExprHelper::createLenTrim(const fir::CharBoxValue &str) { auto elemAddr = createElementAddr(fromBuff, index); auto codeAddr = builder.createConvert(loc, builder.getRefType(blank.getType()), elemAddr); - auto c = builder.create(loc, codeAddr); - auto isBlank = builder.create( - loc, mlir::arith::CmpIPredicate::eq, blank, c); + auto c = fir::LoadOp::create(builder, loc, codeAddr); + auto isBlank = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, blank, c); llvm::SmallVector results = {isBlank, index}; - builder.create(loc, results); + fir::ResultOp::create(builder, loc, results); builder.restoreInsertionPoint(insPt); // Compute length after iteration (zero if all blanks) mlir::Value newLen = - builder.create(loc, iterWhile.getResult(1), one); - auto result = builder.create( - loc, iterWhile.getResult(0), zero, newLen); + mlir::arith::AddIOp::create(builder, loc, iterWhile.getResult(1), one); + auto result = mlir::arith::SelectOp::create( + builder, loc, iterWhile.getResult(0), zero, newLen); return builder.createConvert(loc, builder.getCharacterLengthType(), result); } @@ -606,7 +607,7 @@ fir::factory::CharacterExprHelper::createCharacterTemp(mlir::Type type, assert(len >= 0 && "expected positive length"); auto kind = recoverCharacterType(type).getFKind(); auto charType = fir::CharacterType::get(builder.getContext(), kind, len); - auto addr = builder.create(loc, charType); + auto addr = fir::AllocaOp::create(builder, loc, charType); auto mlirLen = builder.createIntegerConstant(loc, builder.getCharacterLengthType(), len); return {addr, mlirLen}; @@ -690,10 +691,10 @@ fir::factory::CharacterExprHelper::createSingletonFromCode(mlir::Value code, auto bits = builder.getKindMap().getCharacterBitsize(kind); auto intType = builder.getIntegerType(bits); auto cast = builder.createConvert(loc, intType, code); - auto undef = builder.create(loc, charType); + auto undef = fir::UndefOp::create(builder, loc, charType); auto zero = builder.getIntegerAttr(builder.getIndexType(), 0); - return builder.create(loc, charType, undef, cast, - builder.getArrayAttr(zero)); + return fir::InsertValueOp::create(builder, loc, charType, undef, cast, + builder.getArrayAttr(zero)); } mlir::Value fir::factory::CharacterExprHelper::extractCodeFromSingleton( @@ -703,8 +704,8 @@ mlir::Value fir::factory::CharacterExprHelper::extractCodeFromSingleton( auto bits = builder.getKindMap().getCharacterBitsize(type.getFKind()); auto intType = builder.getIntegerType(bits); auto zero = builder.getIntegerAttr(builder.getIndexType(), 0); - return builder.create(loc, intType, singleton, - builder.getArrayAttr(zero)); + return fir::ExtractValueOp::create(builder, loc, intType, singleton, + builder.getArrayAttr(zero)); } mlir::Value @@ -716,12 +717,12 @@ 
fir::factory::CharacterExprHelper::readLengthFromBox(mlir::Value box) { mlir::Value fir::factory::CharacterExprHelper::readLengthFromBox( mlir::Value box, fir::CharacterType charTy) { auto lenTy = builder.getCharacterLengthType(); - auto size = builder.create(loc, lenTy, box); + auto size = fir::BoxEleSizeOp::create(builder, loc, lenTy, box); auto bits = builder.getKindMap().getCharacterBitsize(charTy.getFKind()); auto width = bits / 8; if (width > 1) { auto widthVal = builder.createIntegerConstant(loc, lenTy, width); - return builder.create(loc, size, widthVal); + return mlir::arith::DivSIOp::create(builder, loc, size, widthVal); } return size; } @@ -748,18 +749,18 @@ fir::factory::extractCharacterProcedureTuple(fir::FirOpBuilder &builder, mlir::Value tuple, bool openBoxProc) { mlir::TupleType tupleType = mlir::cast(tuple.getType()); - mlir::Value addr = builder.create( - loc, tupleType.getType(0), tuple, + mlir::Value addr = fir::ExtractValueOp::create( + builder, loc, tupleType.getType(0), tuple, builder.getArrayAttr( {builder.getIntegerAttr(builder.getIndexType(), 0)})); mlir::Value proc = [&]() -> mlir::Value { if (openBoxProc) if (auto addrTy = mlir::dyn_cast(addr.getType())) - return builder.create(loc, addrTy.getEleTy(), addr); + return fir::BoxAddrOp::create(builder, loc, addrTy.getEleTy(), addr); return addr; }(); - mlir::Value len = builder.create( - loc, tupleType.getType(1), tuple, + mlir::Value len = fir::ExtractValueOp::create( + builder, loc, tupleType.getType(1), tuple, builder.getArrayAttr( {builder.getIntegerAttr(builder.getIndexType(), 1)})); return {proc, len}; @@ -773,14 +774,14 @@ mlir::Value fir::factory::createCharacterProcedureTuple( if (len) len = builder.createConvert(loc, tupleType.getType(1), len); else - len = builder.create(loc, tupleType.getType(1)); - mlir::Value tuple = builder.create(loc, tupleType); - tuple = builder.create( - loc, tupleType, tuple, addr, + len = fir::UndefOp::create(builder, loc, tupleType.getType(1)); + mlir::Value tuple = fir::UndefOp::create(builder, loc, tupleType); + tuple = fir::InsertValueOp::create( + builder, loc, tupleType, tuple, addr, builder.getArrayAttr( {builder.getIntegerAttr(builder.getIndexType(), 0)})); - tuple = builder.create( - loc, tupleType, tuple, len, + tuple = fir::InsertValueOp::create( + builder, loc, tupleType, tuple, len, builder.getArrayAttr( {builder.getIntegerAttr(builder.getIndexType(), 1)})); return tuple; @@ -827,10 +828,10 @@ fir::CharBoxValue fir::factory::CharacterExprHelper::createCharExtremum( auto currLen = builder.createConvert(loc, builder.getCharacterLengthType(), currChar.getLen()); // biggest len result - mlir::Value lhsBigger = builder.create( - loc, mlir::arith::CmpIPredicate::uge, biggestLen, currLen); - biggestLen = builder.create(loc, lhsBigger, - biggestLen, currLen); + mlir::Value lhsBigger = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::uge, biggestLen, currLen); + biggestLen = mlir::arith::SelectOp::create(builder, loc, lhsBigger, + biggestLen, currLen); auto cmp = predIsMin ? 
mlir::arith::CmpIPredicate::slt : mlir::arith::CmpIPredicate::sgt; @@ -843,10 +844,10 @@ fir::CharBoxValue fir::factory::CharacterExprHelper::createCharExtremum( resultBuf = builder.createConvert(loc, type, resultBuf); currBuf = builder.createConvert(loc, type, currBuf); - resultBuf = builder.create(loc, resultCmp, currBuf, - resultBuf); - resultLen = builder.create(loc, resultCmp, currLen, - resultLen); + resultBuf = mlir::arith::SelectOp::create(builder, loc, resultCmp, currBuf, + resultBuf); + resultLen = mlir::arith::SelectOp::create(builder, loc, resultCmp, currLen, + resultLen); } // now that we know the lexicographically biggest/smallest char and which char @@ -876,7 +877,7 @@ fir::factory::convertCharacterKind(fir::FirOpBuilder &builder, // As a value, it ought to have a constant LEN value. assert(charTy.hasConstantLen() && "must have constant length"); mlir::Value tmp = builder.createTemporary(loc, charTy); - builder.create(loc, boxCharAddr, tmp); + fir::StoreOp::create(builder, loc, boxCharAddr, tmp); boxCharAddr = tmp; } auto fromBits = kindMap.getCharacterBitsize( @@ -886,13 +887,13 @@ fir::factory::convertCharacterKind(fir::FirOpBuilder &builder, // Scale by relative ratio to give a buffer of the same length. auto ratio = builder.createIntegerConstant(loc, bufferSize.getType(), fromBits / toBits); - bufferSize = builder.create(loc, bufferSize, ratio); + bufferSize = mlir::arith::MulIOp::create(builder, loc, bufferSize, ratio); } mlir::Type toType = fir::CharacterType::getUnknownLen(builder.getContext(), toKind); auto dest = builder.createTemporary(loc, toType, /*name=*/{}, /*shape=*/{}, mlir::ValueRange{bufferSize}); - builder.create(loc, boxCharAddr, srcBoxChar.getLen(), - dest); + fir::CharConvertOp::create(builder, loc, boxCharAddr, srcBoxChar.getLen(), + dest); return fir::CharBoxValue{dest, srcBoxChar.getLen()}; } diff --git a/flang/lib/Optimizer/Builder/Complex.cpp b/flang/lib/Optimizer/Builder/Complex.cpp index 69f97dd654ce0..61de9880774ac 100644 --- a/flang/lib/Optimizer/Builder/Complex.cpp +++ b/flang/lib/Optimizer/Builder/Complex.cpp @@ -24,7 +24,7 @@ mlir::Type fir::factory::Complex::getComplexPartType(mlir::Value cplx) const { mlir::Value fir::factory::Complex::createComplex(mlir::Type cplxTy, mlir::Value real, mlir::Value imag) { - mlir::Value und = builder.create(loc, cplxTy); + mlir::Value und = fir::UndefOp::create(builder, loc, cplxTy); return insert(insert(und, real), imag); } diff --git a/flang/lib/Optimizer/Builder/DoLoopHelper.cpp b/flang/lib/Optimizer/Builder/DoLoopHelper.cpp index 4b12e281b5153..0ec91d5883b92 100644 --- a/flang/lib/Optimizer/Builder/DoLoopHelper.cpp +++ b/flang/lib/Optimizer/Builder/DoLoopHelper.cpp @@ -20,7 +20,7 @@ fir::factory::DoLoopHelper::createLoop(mlir::Value lb, mlir::Value ub, auto ubi = builder.convertToIndexType(loc, ub); assert(step && "step must be an actual Value"); auto inc = builder.convertToIndexType(loc, step); - auto loop = builder.create(loc, lbi, ubi, inc); + auto loop = fir::DoLoopOp::create(builder, loc, lbi, ubi, inc); auto insertPt = builder.saveInsertionPoint(); builder.setInsertionPointToStart(loop.getBody()); auto index = loop.getInductionVar(); @@ -43,6 +43,6 @@ fir::factory::DoLoopHelper::createLoop(mlir::Value count, auto indexType = builder.getIndexType(); auto zero = builder.createIntegerConstant(loc, indexType, 0); auto one = builder.createIntegerConstant(loc, count.getType(), 1); - auto up = builder.create(loc, count, one); + auto up = mlir::arith::SubIOp::create(builder, loc, count, one); return 
createLoop(zero, up, one, bodyGenerator); } diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index acd5a88a2582d..eaad54eb9eec2 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -406,10 +406,10 @@ mlir::Value fir::FirOpBuilder::genTempDeclareOp( llvm::ArrayRef typeParams, fir::FortranVariableFlagsAttr fortranAttrs) { auto nameAttr = mlir::StringAttr::get(builder.getContext(), name); - return builder.create(loc, memref.getType(), memref, shape, - typeParams, - /*dummy_scope=*/nullptr, nameAttr, - fortranAttrs, cuf::DataAttributeAttr{}); + return fir::DeclareOp::create(builder, loc, memref.getType(), memref, shape, + typeParams, + /*dummy_scope=*/nullptr, nameAttr, fortranAttrs, + cuf::DataAttributeAttr{}); } mlir::Value fir::FirOpBuilder::genStackSave(mlir::Location loc) { @@ -585,7 +585,7 @@ mlir::Value fir::factory::createConvert(mlir::OpBuilder &builder, mlir::cast(val.getType()).getTypeList() == mlir::cast(toTy).getTypeList()) && "incompatible record types"); - return builder.create(loc, toTy, val); + return fir::ConvertOp::create(builder, loc, toTy, val); } return val; } @@ -620,7 +620,7 @@ fir::StringLitOp fir::FirOpBuilder::createStringLitOp(mlir::Location loc, mlir::NamedAttribute sizeAttr(sizeTag, getI64IntegerAttr(data.size())); llvm::SmallVector attrs{dataAttr, sizeAttr}; return create(loc, llvm::ArrayRef{type}, - std::nullopt, attrs); + mlir::ValueRange{}, attrs); } mlir::Value fir::FirOpBuilder::genShape(mlir::Location loc, @@ -824,7 +824,7 @@ genNullPointerComparison(fir::FirOpBuilder &builder, mlir::Location loc, auto intPtrTy = builder.getIntPtrType(); auto ptrToInt = builder.createConvert(loc, intPtrTy, addr); auto c0 = builder.createIntegerConstant(loc, intPtrTy, 0); - return builder.create(loc, condition, ptrToInt, c0); + return mlir::arith::CmpIOp::create(builder, loc, condition, ptrToInt, c0); } mlir::Value fir::FirOpBuilder::genIsNotNullAddr(mlir::Location loc, @@ -1028,8 +1028,8 @@ fir::factory::readExtents(fir::FirOpBuilder &builder, mlir::Location loc, auto idxTy = builder.getIndexType(); for (decltype(rank) dim = 0; dim < rank; ++dim) { auto dimVal = builder.createIntegerConstant(loc, idxTy, dim); - auto dimInfo = builder.create(loc, idxTy, idxTy, idxTy, - box.getAddr(), dimVal); + auto dimInfo = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, + box.getAddr(), dimVal); result.emplace_back(dimInfo.getResult(1)); } return result; @@ -1061,7 +1061,7 @@ fir::ExtendedValue fir::factory::readBoxValue(fir::FirOpBuilder &builder, assert(!box.hasAssumedRank() && "cannot read unlimited polymorphic or assumed rank fir.box"); auto addr = - builder.create(loc, box.getMemTy(), box.getAddr()); + fir::BoxAddrOp::create(builder, loc, box.getMemTy(), box.getAddr()); if (box.isCharacter()) { auto len = fir::factory::readCharLen(builder, loc, box); if (box.rank() == 0) @@ -1139,13 +1139,13 @@ static llvm::SmallVector getFromBox(mlir::Location loc, } else if (auto charTy = mlir::dyn_cast(eleTy)) { if (charTy.hasDynamicLen()) { auto idxTy = builder.getIndexType(); - auto eleSz = builder.create(loc, idxTy, boxVal); + auto eleSz = fir::BoxEleSizeOp::create(builder, loc, idxTy, boxVal); auto kindBytes = builder.getKindMap().getCharacterBitsize(charTy.getFKind()) / 8; mlir::Value charSz = builder.createIntegerConstant(loc, idxTy, kindBytes); mlir::Value len = - builder.create(loc, eleSz, charSz); + mlir::arith::DivSIOp::create(builder, loc, eleSz, charSz); return {len}; } 
} @@ -1237,11 +1237,11 @@ fir::ExtendedValue fir::factory::createStringLiteral(fir::FirOpBuilder &builder, loc, type, globalName, [&](fir::FirOpBuilder &builder) { auto stringLitOp = builder.createStringLitOp(loc, str); - builder.create(loc, stringLitOp); + fir::HasValueOp::create(builder, loc, stringLitOp); }, builder.createLinkOnceLinkage()); - auto addr = builder.create(loc, global.resultType(), - global.getSymbol()); + auto addr = fir::AddrOfOp::create(builder, loc, global.resultType(), + global.getSymbol()); auto len = builder.createIntegerConstant( loc, builder.getCharacterLengthType(), str.size()); return fir::CharBoxValue{addr, len}; @@ -1255,7 +1255,7 @@ fir::factory::createExtents(fir::FirOpBuilder &builder, mlir::Location loc, for (auto ext : seqTy.getShape()) extents.emplace_back( ext == fir::SequenceType::getUnknownExtent() - ? builder.create(loc, idxTy).getResult() + ? fir::UndefOp::create(builder, loc, idxTy).getResult() : builder.createIntegerConstant(loc, idxTy, ext)); return extents; } @@ -1396,11 +1396,11 @@ void fir::factory::genScalarAssignment(fir::FirOpBuilder &builder, assert(!fir::hasDynamicSize(type)); auto rhsVal = fir::getBase(rhs); if (fir::isa_ref_type(rhsVal.getType())) - rhsVal = builder.create(loc, rhsVal); + rhsVal = fir::LoadOp::create(builder, loc, rhsVal); mlir::Value lhsAddr = fir::getBase(lhs); rhsVal = builder.createConvert(loc, fir::unwrapRefType(lhsAddr.getType()), rhsVal); - builder.create(loc, rhsVal, lhsAddr); + fir::StoreOp::create(builder, loc, rhsVal, lhsAddr); } } @@ -1421,16 +1421,18 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder, auto &[lFieldName, lFieldTy] = lhsPair; auto &[rFieldName, rFieldTy] = rhsPair; assert(!fir::hasDynamicSize(lFieldTy) && !fir::hasDynamicSize(rFieldTy)); - mlir::Value rField = builder.create( - loc, fieldIndexType, rFieldName, rhsType, fir::getTypeParams(rhs)); + mlir::Value rField = + fir::FieldIndexOp::create(builder, loc, fieldIndexType, rFieldName, + rhsType, fir::getTypeParams(rhs)); auto rFieldRefType = builder.getRefType(rFieldTy); - mlir::Value fromCoor = builder.create( - loc, rFieldRefType, fir::getBase(rhs), rField); - mlir::Value field = builder.create( - loc, fieldIndexType, lFieldName, lhsType, fir::getTypeParams(lhs)); + mlir::Value fromCoor = fir::CoordinateOp::create( + builder, loc, rFieldRefType, fir::getBase(rhs), rField); + mlir::Value field = + fir::FieldIndexOp::create(builder, loc, fieldIndexType, lFieldName, + lhsType, fir::getTypeParams(lhs)); auto fieldRefType = builder.getRefType(lFieldTy); - mlir::Value toCoor = builder.create( - loc, fieldRefType, fir::getBase(lhs), field); + mlir::Value toCoor = fir::CoordinateOp::create(builder, loc, fieldRefType, + fir::getBase(lhs), field); std::optional outerLoop; if (auto sequenceType = mlir::dyn_cast(lFieldTy)) { // Create loops to assign array components elements by elements. @@ -1444,7 +1446,7 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder, for (auto extent : llvm::reverse(sequenceType.getShape())) { // TODO: add zero size test ! mlir::Value ub = builder.createIntegerConstant(loc, idxTy, extent - 1); - auto loop = builder.create(loc, zero, ub, one); + auto loop = fir::DoLoopOp::create(builder, loc, zero, ub, one); if (!outerLoop) outerLoop = loop; indices.push_back(loop.getInductionVar()); @@ -1453,19 +1455,19 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder, // Set indices in column-major order. 
std::reverse(indices.begin(), indices.end()); auto elementRefType = builder.getRefType(sequenceType.getEleTy()); - toCoor = builder.create(loc, elementRefType, toCoor, - indices); - fromCoor = builder.create(loc, elementRefType, - fromCoor, indices); + toCoor = fir::CoordinateOp::create(builder, loc, elementRefType, toCoor, + indices); + fromCoor = fir::CoordinateOp::create(builder, loc, elementRefType, + fromCoor, indices); } if (auto fieldEleTy = fir::unwrapSequenceType(lFieldTy); mlir::isa(fieldEleTy)) { assert(mlir::isa( mlir::cast(fieldEleTy).getEleTy()) && "allocatable members require deep copy"); - auto fromPointerValue = builder.create(loc, fromCoor); + auto fromPointerValue = fir::LoadOp::create(builder, loc, fromCoor); auto castTo = builder.createConvert(loc, fieldEleTy, fromPointerValue); - builder.create(loc, castTo, toCoor); + fir::StoreOp::create(builder, loc, castTo, toCoor); } else { auto from = fir::factory::componentToExtendedValue(builder, loc, fromCoor); @@ -1543,7 +1545,7 @@ void fir::factory::genRecordAssignment(fir::FirOpBuilder &builder, // runtime interface, but assume the fir.box is unchanged. // TODO: does this holds true with polymorphic entities ? auto toMutableBox = builder.createTemporary(loc, to.getType()); - builder.create(loc, to, toMutableBox); + fir::StoreOp::create(builder, loc, to, toMutableBox); if (isTemporaryLHS) fir::runtime::genAssignTemporary(builder, loc, toMutableBox, from); else @@ -1588,12 +1590,12 @@ mlir::Value fir::factory::genLenOfCharacter( auto idxTy = builder.getIndexType(); auto zero = builder.createIntegerConstant(loc, idxTy, 0); auto saturatedDiff = [&](mlir::Value lower, mlir::Value upper) { - auto diff = builder.create(loc, upper, lower); + auto diff = mlir::arith::SubIOp::create(builder, loc, upper, lower); auto one = builder.createIntegerConstant(loc, idxTy, 1); - auto size = builder.create(loc, diff, one); - auto cmp = builder.create( - loc, mlir::arith::CmpIPredicate::sgt, size, zero); - return builder.create(loc, cmp, size, zero); + auto size = mlir::arith::AddIOp::create(builder, loc, diff, one); + auto cmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sgt, size, zero); + return mlir::arith::SelectOp::create(builder, loc, cmp, size, zero); }; if (substring.size() == 2) { auto upper = builder.createConvert(loc, idxTy, substring.back()); @@ -1615,7 +1617,7 @@ mlir::Value fir::factory::genLenOfCharacter( } if (fir::isa_box_type(memref.getType())) { if (mlir::isa(memref.getType())) - return builder.create(loc, idxTy, memref); + return fir::BoxCharLenOp::create(builder, loc, idxTy, memref); if (mlir::isa(memref.getType())) return CharacterExprHelper(builder, loc).readLengthFromBox(memref); fir::emitFatalError(loc, "memref has wrong type"); @@ -1684,10 +1686,10 @@ mlir::Value fir::factory::genMaxWithZero(fir::FirOpBuilder &builder, if (auto cst = mlir::dyn_cast(definingOp)) if (auto intAttr = mlir::dyn_cast(cst.getValue())) return intAttr.getInt() > 0 ? 
value : zero; - mlir::Value valueIsGreater = builder.create( - loc, mlir::arith::CmpIPredicate::sgt, value, zero); - return builder.create(loc, valueIsGreater, value, - zero); + mlir::Value valueIsGreater = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sgt, value, zero); + return mlir::arith::SelectOp::create(builder, loc, valueIsGreater, value, + zero); } mlir::Value fir::factory::genMaxWithZero(fir::FirOpBuilder &builder, @@ -1703,8 +1705,8 @@ mlir::Value fir::factory::computeExtent(fir::FirOpBuilder &builder, mlir::Value one) { mlir::Type type = lb.getType(); // Let the folder deal with the common `ub - + 1` case. - auto diff = builder.create(loc, type, ub, lb); - auto rawExtent = builder.create(loc, type, diff, one); + auto diff = mlir::arith::SubIOp::create(builder, loc, type, ub, lb); + auto rawExtent = mlir::arith::AddIOp::create(builder, loc, type, diff, one); return fir::factory::genMaxWithZero(builder, loc, rawExtent, zero); } mlir::Value fir::factory::computeExtent(fir::FirOpBuilder &builder, @@ -1724,8 +1726,8 @@ genCPtrOrCFunptrFieldIndex(fir::FirOpBuilder &builder, mlir::Location loc, auto addrFieldName = recTy.getTypeList()[0].first; mlir::Type addrFieldTy = recTy.getTypeList()[0].second; auto fieldIndexType = fir::FieldType::get(cptrTy.getContext()); - mlir::Value addrFieldIndex = builder.create( - loc, fieldIndexType, addrFieldName, recTy, + mlir::Value addrFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, addrFieldName, recTy, /*typeParams=*/mlir::ValueRange{}); return {addrFieldIndex, addrFieldTy}; } @@ -1736,8 +1738,8 @@ mlir::Value fir::factory::genCPtrOrCFunptrAddr(fir::FirOpBuilder &builder, mlir::Type ty) { auto [addrFieldIndex, addrFieldTy] = genCPtrOrCFunptrFieldIndex(builder, loc, ty); - return builder.create(loc, builder.getRefType(addrFieldTy), - cPtr, addrFieldIndex); + return fir::CoordinateOp::create( + builder, loc, builder.getRefType(addrFieldTy), cPtr, addrFieldIndex); } mlir::Value fir::factory::genCDevPtrAddr(fir::FirOpBuilder &builder, @@ -1748,15 +1750,15 @@ mlir::Value fir::factory::genCDevPtrAddr(fir::FirOpBuilder &builder, auto cptrFieldName = recTy.getTypeList()[0].first; mlir::Type cptrFieldTy = recTy.getTypeList()[0].second; auto fieldIndexType = fir::FieldType::get(ty.getContext()); - mlir::Value cptrFieldIndex = builder.create( - loc, fieldIndexType, cptrFieldName, recTy, + mlir::Value cptrFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, cptrFieldName, recTy, /*typeParams=*/mlir::ValueRange{}); - auto cptrCoord = builder.create( - loc, builder.getRefType(cptrFieldTy), cDevPtr, cptrFieldIndex); + auto cptrCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(cptrFieldTy), cDevPtr, cptrFieldIndex); auto [addrFieldIndex, addrFieldTy] = genCPtrOrCFunptrFieldIndex(builder, loc, cptrFieldTy); - return builder.create(loc, builder.getRefType(addrFieldTy), - cptrCoord, addrFieldIndex); + return fir::CoordinateOp::create( + builder, loc, builder.getRefType(addrFieldTy), cptrCoord, addrFieldIndex); } mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder, @@ -1769,13 +1771,13 @@ mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder, genCPtrOrCFunptrFieldIndex(builder, loc, cPtrTy); mlir::Value cPtrCoor; if (fir::isa_ref_type(cPtr.getType())) { - cPtrCoor = builder.create( - loc, builder.getRefType(addrFieldTy), cPtr, addrFieldIndex); + cPtrCoor = fir::CoordinateOp::create( + builder, loc, builder.getRefType(addrFieldTy), cPtr, 
addrFieldIndex); } else { auto arrayAttr = builder.getArrayAttr( {builder.getIntegerAttr(builder.getIndexType(), 0)}); - cPtrCoor = builder.create(loc, addrFieldTy, cPtr, - arrayAttr); + cPtrCoor = fir::ExtractValueOp::create(builder, loc, addrFieldTy, cPtr, + arrayAttr); } return genCPtrOrCFunptrValue(builder, loc, cPtrCoor); } @@ -1783,13 +1785,14 @@ mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder, if (fir::isa_ref_type(cPtr.getType())) { mlir::Value cPtrAddr = fir::factory::genCPtrOrCFunptrAddr(builder, loc, cPtr, cPtrTy); - return builder.create(loc, cPtrAddr); + return fir::LoadOp::create(builder, loc, cPtrAddr); } auto [addrFieldIndex, addrFieldTy] = genCPtrOrCFunptrFieldIndex(builder, loc, cPtrTy); auto arrayAttr = builder.getArrayAttr({builder.getIntegerAttr(builder.getIndexType(), 0)}); - return builder.create(loc, addrFieldTy, cPtr, arrayAttr); + return fir::ExtractValueOp::create(builder, loc, addrFieldTy, cPtr, + arrayAttr); } fir::BoxValue fir::factory::createBoxValue(fir::FirOpBuilder &builder, @@ -1837,8 +1840,8 @@ mlir::Value fir::factory::createNullBoxProc(fir::FirOpBuilder &builder, if (!boxTy) fir::emitFatalError(loc, "Procedure pointer must be of BoxProcType"); auto boxEleTy{fir::unwrapRefType(boxTy.getEleTy())}; - mlir::Value initVal{builder.create(loc, boxEleTy)}; - return builder.create(loc, boxTy, initVal); + mlir::Value initVal{fir::ZeroOp::create(builder, loc, boxEleTy)}; + return fir::EmboxProcOp::create(builder, loc, boxTy, initVal); } void fir::factory::setInternalLinkage(mlir::func::FuncOp func) { @@ -1897,15 +1900,15 @@ llvm::SmallVector fir::factory::updateRuntimeExtentsForEmptyArrays( mlir::Type type = extent.getType(); mlir::Value zero = createZeroValue(builder, loc, type); zeroes.push_back(zero); - mlir::Value isZero = builder.create( - loc, mlir::arith::CmpIPredicate::eq, extent, zero); - isEmpty = builder.create(loc, isEmpty, isZero); + mlir::Value isZero = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, extent, zero); + isEmpty = mlir::arith::OrIOp::create(builder, loc, isEmpty, isZero); } llvm::SmallVector newExtents; for (auto [zero, extent] : llvm::zip_equal(zeroes, extents)) { newExtents.push_back( - builder.create(loc, isEmpty, zero, extent)); + mlir::arith::SelectOp::create(builder, loc, isEmpty, zero, extent)); } return newExtents; } @@ -1926,7 +1929,7 @@ void fir::factory::genDimInfoFromBox( for (unsigned i = 0; i < rank; ++i) { mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); auto dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, box, dim); + fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, box, dim); if (lbounds) lbounds->push_back(dimInfo.getLowerBound()); if (extents) @@ -1943,12 +1946,12 @@ mlir::Value fir::factory::genLifetimeStart(mlir::OpBuilder &builder, mlir::Type ptrTy = mlir::LLVM::LLVMPointerType::get( alloc.getContext(), getAllocaAddressSpace(dl)); mlir::Value cast = - builder.create(loc, ptrTy, alloc.getResult()); - builder.create(loc, size, cast); + fir::ConvertOp::create(builder, loc, ptrTy, alloc.getResult()); + mlir::LLVM::LifetimeStartOp::create(builder, loc, size, cast); return cast; } void fir::factory::genLifetimeEnd(mlir::OpBuilder &builder, mlir::Location loc, mlir::Value cast, int64_t size) { - builder.create(loc, size, cast); + mlir::LLVM::LifetimeEndOp::create(builder, loc, size, cast); } diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index e59a6bf2bf224..c3948f2caf67b 100644 --- 
a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -50,7 +50,7 @@ hlfir::getExplicitExtentsFromShape(mlir::Value shape, int64_t extent = exprShape[i]; mlir::Value extentVal; if (extent == expr.getUnknownExtent()) { - auto op = builder.create(shape.getLoc(), shape, i); + auto op = hlfir::GetExtentOp::create(builder, shape.getLoc(), shape, i); extentVal = op.getResult(); } else { extentVal = @@ -150,7 +150,7 @@ static mlir::Value genCharacterVariableLength(mlir::Location loc, return builder.createIntegerConstant(loc, builder.getIndexType(), charType.getLen()); if (var.isMutableBox()) - var = hlfir::Entity{builder.create(loc, var)}; + var = hlfir::Entity{fir::LoadOp::create(builder, loc, var)}; mlir::Value len = fir::factory::CharacterExprHelper{builder, loc}.getLength( var.getFirBase()); assert(len && "failed to retrieve length"); @@ -164,8 +164,8 @@ static fir::CharBoxValue genUnboxChar(mlir::Location loc, return {emboxChar.getMemref(), emboxChar.getLen()}; mlir::Type refType = fir::ReferenceType::get( mlir::cast(boxChar.getType()).getEleTy()); - auto unboxed = builder.create( - loc, refType, builder.getIndexType(), boxChar); + auto unboxed = fir::UnboxCharOp::create(builder, loc, refType, + builder.getIndexType(), boxChar); mlir::Value addr = unboxed.getResult(0); mlir::Value len = unboxed.getResult(1); if (auto varIface = boxChar.getDefiningOp()) @@ -278,8 +278,9 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder, box.nonDeferredLenParams().end()); }, [](const auto &) {}); - auto declareOp = builder.create( - loc, base, name, shapeOrShift, lenParams, dummyScope, flags, dataAttr); + auto declareOp = + hlfir::DeclareOp::create(builder, loc, base, name, shapeOrShift, + lenParams, dummyScope, flags, dataAttr); return mlir::cast(declareOp.getOperation()); } @@ -312,12 +313,12 @@ hlfir::genAssociateExpr(mlir::Location loc, fir::FirOpBuilder &builder, genLengthParameters(loc, builder, value, lenParams); if (attr) { assert(name.empty() && "It attribute is provided, no-name is expected"); - return builder.create(loc, source, shape, lenParams, - fir::FortranVariableFlagsAttr{}, - llvm::ArrayRef{*attr}); + return hlfir::AssociateOp::create(builder, loc, source, shape, lenParams, + fir::FortranVariableFlagsAttr{}, + llvm::ArrayRef{*attr}); } - return builder.create(loc, source, name, shape, lenParams, - fir::FortranVariableFlagsAttr{}); + return hlfir::AssociateOp::create(builder, loc, source, name, shape, + lenParams, fir::FortranVariableFlagsAttr{}); } mlir::Value hlfir::genVariableRawAddress(mlir::Location loc, @@ -326,12 +327,12 @@ mlir::Value hlfir::genVariableRawAddress(mlir::Location loc, assert(var.isVariable() && "only address of variables can be taken"); mlir::Value baseAddr = var.getFirBase(); if (var.isMutableBox()) - baseAddr = builder.create(loc, baseAddr); + baseAddr = fir::LoadOp::create(builder, loc, baseAddr); // Get raw address. 
if (mlir::isa(var.getType())) baseAddr = genUnboxChar(loc, builder, var.getBase()).getAddr(); if (mlir::isa(baseAddr.getType())) - baseAddr = builder.create(loc, baseAddr); + baseAddr = fir::BoxAddrOp::create(builder, loc, baseAddr); return baseAddr; } @@ -350,8 +351,8 @@ mlir::Value hlfir::genVariableBoxChar(mlir::Location loc, fir::BoxCharType::get(builder.getContext(), charType.getFKind()); auto scalarAddr = builder.createConvert(loc, fir::ReferenceType::get(charType), addr); - return builder.create(loc, boxCharType, scalarAddr, - lengths[0]); + return fir::EmboxCharOp::create(builder, loc, boxCharType, scalarAddr, + lengths[0]); } static hlfir::Entity changeBoxAttributes(mlir::Location loc, @@ -365,8 +366,8 @@ static hlfir::Entity changeBoxAttributes(mlir::Location loc, getNonDefaultLowerBounds(loc, builder, var); if (!lbounds.empty()) shift = builder.genShift(loc, lbounds); - auto rebox = builder.create(loc, forceBoxType, var, shift, - /*slice=*/nullptr); + auto rebox = fir::ReboxOp::create(builder, loc, forceBoxType, var, shift, + /*slice=*/nullptr); return hlfir::Entity{rebox}; } @@ -404,9 +405,8 @@ hlfir::Entity hlfir::genVariableBox(mlir::Location loc, fir::ReferenceType::get(fir::unwrapRefType(forceBoxType.getEleTy())); addr = builder.createConvert(loc, baseType, addr); } - auto embox = - builder.create(loc, boxType, addr, shape, - /*slice=*/mlir::Value{}, typeParams); + auto embox = fir::EmboxOp::create(builder, loc, boxType, addr, shape, + /*slice=*/mlir::Value{}, typeParams); return hlfir::Entity{embox.getResult()}; } @@ -416,7 +416,7 @@ hlfir::Entity hlfir::loadTrivialScalar(mlir::Location loc, entity = derefPointersAndAllocatables(loc, builder, entity); if (entity.isVariable() && entity.isScalar() && fir::isa_trivial(entity.getFortranElementType())) { - return Entity{builder.create(loc, entity)}; + return Entity{fir::LoadOp::create(builder, loc, entity)}; } return entity; } @@ -429,8 +429,8 @@ hlfir::Entity hlfir::getElementAt(mlir::Location loc, llvm::SmallVector lenParams; genLengthParameters(loc, builder, entity, lenParams); if (mlir::isa(entity.getType())) - return hlfir::Entity{builder.create( - loc, entity, oneBasedIndices, lenParams)}; + return hlfir::Entity{hlfir::ApplyOp::create(builder, loc, entity, + oneBasedIndices, lenParams)}; // Build hlfir.designate. The lower bounds may need to be added to // the oneBasedIndices since hlfir.designate expect indices // based on the array operand lower bounds. 
@@ -445,16 +445,16 @@ hlfir::Entity hlfir::getElementAt(mlir::Location loc, for (auto [oneBased, lb] : llvm::zip(oneBasedIndices, lbounds)) { auto lbIdx = builder.createConvert(loc, idxTy, lb); auto oneBasedIdx = builder.createConvert(loc, idxTy, oneBased); - auto shift = builder.create(loc, lbIdx, one); + auto shift = mlir::arith::SubIOp::create(builder, loc, lbIdx, one); mlir::Value index = - builder.create(loc, oneBasedIdx, shift); + mlir::arith::AddIOp::create(builder, loc, oneBasedIdx, shift); indices.push_back(index); } - designate = builder.create(loc, resultType, entity, - indices, lenParams); + designate = hlfir::DesignateOp::create(builder, loc, resultType, entity, + indices, lenParams); } else { - designate = builder.create(loc, resultType, entity, - oneBasedIndices, lenParams); + designate = hlfir::DesignateOp::create(builder, loc, resultType, entity, + oneBasedIndices, lenParams); } return mlir::cast(designate.getOperation()); } @@ -467,8 +467,8 @@ static mlir::Value genUBound(mlir::Location loc, fir::FirOpBuilder &builder, return extent; extent = builder.createConvert(loc, one.getType(), extent); lb = builder.createConvert(loc, one.getType(), lb); - auto add = builder.create(loc, lb, extent); - return builder.create(loc, add, one); + auto add = mlir::arith::AddIOp::create(builder, loc, lb, extent); + return mlir::arith::SubIOp::create(builder, loc, add, one); } llvm::SmallVector> @@ -557,8 +557,8 @@ static mlir::Value computeVariableExtent(mlir::Location loc, assert(mlir::isa(variable.getType()) && "array variable with dynamic extent must be boxed"); mlir::Value dimVal = builder.createIntegerConstant(loc, idxTy, dim); - auto dimInfo = builder.create(loc, idxTy, idxTy, idxTy, - variable, dimVal); + auto dimInfo = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, + variable, dimVal); return dimInfo.getExtent(); } llvm::SmallVector getVariableExtents(mlir::Location loc, @@ -608,14 +608,14 @@ mlir::Value hlfir::genShape(mlir::Location loc, fir::FirOpBuilder &builder, return shape; if (mlir::isa(shape.getType())) if (auto s = shape.getDefiningOp()) - return builder.create(loc, s.getExtents()); + return fir::ShapeOp::create(builder, loc, s.getExtents()); } if (mlir::isa(entity.getType())) - return builder.create(loc, entity.getBase()); + return hlfir::ShapeOfOp::create(builder, loc, entity.getBase()); // There is no shape lying around for this entity. Retrieve the extents and // build a new fir.shape. 
- return builder.create(loc, - getVariableExtents(loc, builder, entity)); + return fir::ShapeOp::create(builder, loc, + getVariableExtents(loc, builder, entity)); } llvm::SmallVector @@ -668,7 +668,7 @@ mlir::Value hlfir::genLBound(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Type idxTy = builder.getIndexType(); mlir::Value dimVal = builder.createIntegerConstant(loc, idxTy, dim); auto dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, entity, dimVal); + fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, entity, dimVal); return dimInfo.getLowerBound(); } @@ -708,7 +708,7 @@ void hlfir::genLengthParameters(mlir::Location loc, fir::FirOpBuilder &builder, return; } if (entity.isCharacter()) { - result.push_back(builder.create(loc, expr)); + result.push_back(hlfir::GetLengthOp::create(builder, loc, expr)); return; } TODO(loc, "inquire PDTs length parameters of hlfir.expr"); @@ -735,7 +735,7 @@ mlir::Value hlfir::genRank(mlir::Location loc, fir::FirOpBuilder &builder, return builder.createIntegerConstant(loc, resultType, entity.getRank()); assert(entity.isBoxAddressOrValue() && "assumed-ranks are box addresses or values"); - return builder.create(loc, resultType, entity); + return fir::BoxRankOp::create(builder, loc, resultType, entity); } // Return a "shape" that can be used in fir.embox/fir.rebox with \p exv base. @@ -796,20 +796,20 @@ hlfir::Entity hlfir::derefPointersAndAllocatables(mlir::Location loc, fir::FirOpBuilder &builder, Entity entity) { if (entity.isMutableBox()) { - hlfir::Entity boxLoad{builder.create(loc, entity)}; + hlfir::Entity boxLoad{fir::LoadOp::create(builder, loc, entity)}; if (entity.isScalar()) { if (!entity.isPolymorphic() && !entity.hasLengthParameters()) - return hlfir::Entity{builder.create(loc, boxLoad)}; + return hlfir::Entity{fir::BoxAddrOp::create(builder, loc, boxLoad)}; mlir::Type elementType = boxLoad.getFortranElementType(); if (auto charType = mlir::dyn_cast(elementType)) { - mlir::Value base = builder.create(loc, boxLoad); + mlir::Value base = fir::BoxAddrOp::create(builder, loc, boxLoad); if (charType.hasConstantLen()) return hlfir::Entity{base}; mlir::Value len = genCharacterVariableLength(loc, builder, entity); auto boxCharType = fir::BoxCharType::get(builder.getContext(), charType.getFKind()); return hlfir::Entity{ - builder.create(loc, boxCharType, base, len) + fir::EmboxCharOp::create(builder, loc, boxCharType, base, len) .getResult()}; } } @@ -819,7 +819,7 @@ hlfir::Entity hlfir::derefPointersAndAllocatables(mlir::Location loc, // information. Keep them boxed. return boxLoad; } else if (entity.isProcedurePointer()) { - return hlfir::Entity{builder.create(loc, entity)}; + return hlfir::Entity{fir::LoadOp::create(builder, loc, entity)}; } return entity; } @@ -870,8 +870,8 @@ hlfir::ElementalOp hlfir::genElementalOp( mlir::Value polymorphicMold, mlir::Type exprType) { if (!exprType) exprType = getArrayExprType(elementType, shape, !!polymorphicMold); - auto elementalOp = builder.create( - loc, exprType, shape, polymorphicMold, typeParams, isUnordered); + auto elementalOp = hlfir::ElementalOp::create( + builder, loc, exprType, shape, polymorphicMold, typeParams, isUnordered); auto insertPt = builder.saveInsertionPoint(); builder.setInsertionPointToStart(elementalOp.getBody()); mlir::Value elementResult = genKernel(loc, builder, elementalOp.getIndices()); @@ -881,7 +881,7 @@ hlfir::ElementalOp hlfir::genElementalOp( // here. 
if (fir::isa_trivial(elementResult.getType())) elementResult = builder.createConvert(loc, elementType, elementResult); - builder.create(loc, elementResult); + hlfir::YieldElementOp::create(builder, loc, elementResult); builder.restoreInsertionPoint(insertPt); return elementalOp; } @@ -948,10 +948,10 @@ hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc, mlir::OpBuilder::InsertionGuard guard(builder); loopNest.oneBasedIndices.assign(extents.size(), mlir::Value{}); // Build loop nest from column to row. - auto one = builder.create(loc, 1); + auto one = mlir::arith::ConstantIndexOp::create(builder, loc, 1); mlir::Type indexType = builder.getIndexType(); if (emitWorkshareLoop) { - auto wslw = builder.create(loc); + auto wslw = mlir::omp::WorkshareLoopWrapperOp::create(builder, loc); loopNest.outerOp = wslw; builder.createBlock(&wslw.getRegion()); mlir::omp::LoopNestOperands lnops; @@ -961,12 +961,12 @@ hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc, lnops.loopUpperBounds.push_back(extent); lnops.loopSteps.push_back(one); } - auto lnOp = builder.create(loc, lnops); + auto lnOp = mlir::omp::LoopNestOp::create(builder, loc, lnops); mlir::Block *block = builder.createBlock(&lnOp.getRegion()); for (auto extent : llvm::reverse(extents)) block->addArgument(extent.getType(), extent.getLoc()); loopNest.body = block; - builder.create(loc); + mlir::omp::YieldOp::create(builder, loc); for (unsigned dim = 0; dim < extents.size(); dim++) loopNest.oneBasedIndices[extents.size() - dim - 1] = lnOp.getRegion().front().getArgument(dim); @@ -975,7 +975,7 @@ hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc, for (auto extent : llvm::reverse(extents)) { auto ub = builder.createConvert(loc, indexType, extent); auto doLoop = - builder.create(loc, one, ub, one, isUnordered); + fir::DoLoopOp::create(builder, loc, one, ub, one, isUnordered); if (!couldVectorize) { mlir::LLVM::LoopVectorizeAttr va{mlir::LLVM::LoopVectorizeAttr::get( builder.getContext(), @@ -1002,7 +1002,7 @@ llvm::SmallVector hlfir::genLoopNestWithReductions( bool isUnordered) { assert(!extents.empty() && "must have at least one extent"); // Build loop nest from column to row. - auto one = builder.create(loc, 1); + auto one = mlir::arith::ConstantIndexOp::create(builder, loc, 1); mlir::Type indexType = builder.getIndexType(); unsigned dim = extents.size() - 1; fir::DoLoopOp outerLoop = nullptr; @@ -1018,16 +1018,15 @@ llvm::SmallVector hlfir::genLoopNestWithReductions( // of its parent loop. fir::DoLoopOp doLoop; if (!parentLoop) { - doLoop = builder.create(loc, one, ub, one, isUnordered, - /*finalCountValue=*/false, - reductionInits); + doLoop = fir::DoLoopOp::create(builder, loc, one, ub, one, isUnordered, + /*finalCountValue=*/false, reductionInits); } else { - doLoop = builder.create(loc, one, ub, one, isUnordered, - /*finalCountValue=*/false, - parentLoop.getRegionIterArgs()); + doLoop = fir::DoLoopOp::create(builder, loc, one, ub, one, isUnordered, + /*finalCountValue=*/false, + parentLoop.getRegionIterArgs()); if (!reductionInits.empty()) { // Return the results of the child loop from its parent loop. 
- builder.create(loc, doLoop.getResults()); + fir::ResultOp::create(builder, loc, doLoop.getResults()); } } @@ -1044,7 +1043,7 @@ llvm::SmallVector hlfir::genLoopNestWithReductions( genBody(loc, builder, oneBasedIndices, parentLoop.getRegionIterArgs()); builder.setInsertionPointToEnd(parentLoop.getBody()); if (!reductionValues.empty()) - builder.create(loc, reductionValues); + fir::ResultOp::create(builder, loc, reductionValues); builder.setInsertionPointAfter(outerLoop); return outerLoop->getResults(); } @@ -1057,18 +1056,18 @@ conditionallyEvaluate(mlir::Location loc, fir::FirOpBuilder &builder, // Evaluate in some region that will be moved into the actual ifOp (the actual // ifOp can only be created when the result types are known). - auto badIfOp = builder.create(loc, condition.getType(), condition, - /*withElseRegion=*/false); + auto badIfOp = fir::IfOp::create(builder, loc, condition.getType(), condition, + /*withElseRegion=*/false); mlir::Block *preparationBlock = &badIfOp.getThenRegion().front(); builder.setInsertionPointToStart(preparationBlock); fir::ExtendedValue result = genIfTrue(); fir::ResultOp resultOp = result.match( [&](const fir::CharBoxValue &box) -> fir::ResultOp { - return builder.create( - loc, mlir::ValueRange{box.getAddr(), box.getLen()}); + return fir::ResultOp::create( + builder, loc, mlir::ValueRange{box.getAddr(), box.getLen()}); }, [&](const mlir::Value &addr) -> fir::ResultOp { - return builder.create(loc, addr); + return fir::ResultOp::create(builder, loc, addr); }, [&](const auto &) -> fir::ResultOp { TODO(loc, "unboxing non scalar optional fir.box"); @@ -1077,8 +1076,8 @@ conditionallyEvaluate(mlir::Location loc, fir::FirOpBuilder &builder, // Create actual fir.if operation. auto ifOp = - builder.create(loc, resultOp->getOperandTypes(), condition, - /*withElseRegion=*/true); + fir::IfOp::create(builder, loc, resultOp->getOperandTypes(), condition, + /*withElseRegion=*/true); // Move evaluation into Then block, preparationBlock->moveBefore(&ifOp.getThenRegion().back()); ifOp.getThenRegion().back().erase(); @@ -1087,11 +1086,11 @@ conditionallyEvaluate(mlir::Location loc, fir::FirOpBuilder &builder, llvm::SmallVector absentValues; for (mlir::Type resTy : ifOp->getResultTypes()) { if (fir::isa_ref_type(resTy) || fir::isa_box_type(resTy)) - absentValues.emplace_back(builder.create(loc, resTy)); + absentValues.emplace_back(fir::AbsentOp::create(builder, loc, resTy)); else - absentValues.emplace_back(builder.create(loc, resTy)); + absentValues.emplace_back(fir::ZeroOp::create(builder, loc, resTy)); } - builder.create(loc, absentValues); + fir::ResultOp::create(builder, loc, absentValues); badIfOp->erase(); // Build fir::ExtendedValue from the result values. 
@@ -1139,8 +1138,8 @@ static fir::ExtendedValue translateVariableToExtendedValue( } if (variable.mayBeOptional()) { if (!keepScalarOptionalBoxed && variable.isScalar()) { - mlir::Value isPresent = builder.create( - loc, builder.getI1Type(), variable); + mlir::Value isPresent = fir::IsPresentOp::create( + builder, loc, builder.getI1Type(), variable); return conditionallyEvaluate( loc, builder, isPresent, [&]() -> fir::ExtendedValue { mlir::Value base = genVariableRawAddress(loc, builder, variable); @@ -1249,7 +1248,7 @@ static fir::ExtendedValue placeTrivialInMemory(mlir::Location loc, if (targetType != val.getType()) builder.createStoreWithConvert(loc, val, temp); else - builder.create(loc, val, temp); + fir::StoreOp::create(builder, loc, val, temp); return temp; } @@ -1369,8 +1368,8 @@ hlfir::createTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder, llvm::ArrayRef typeParams, fir::FortranVariableFlagsAttr attrs) -> mlir::Value { auto declareOp = - builder.create(loc, memref, name, shape, typeParams, - /*dummy_scope=*/nullptr, attrs); + hlfir::DeclareOp::create(builder, loc, memref, name, shape, typeParams, + /*dummy_scope=*/nullptr, attrs); return declareOp.getBase(); }; @@ -1403,11 +1402,11 @@ hlfir::Entity hlfir::createStackTempFromMold(mlir::Location loc, builder.createTemporary(loc, sequenceType, tmpName, extents, lenParams); } else { alloc = builder.createTemporary(loc, mold.getFortranElementType(), tmpName, - /*shape=*/std::nullopt, lenParams); + /*shape=*/{}, lenParams); } auto declareOp = - builder.create(loc, alloc, tmpName, shape, lenParams, - /*dummy_scope=*/nullptr, declAttrs); + hlfir::DeclareOp::create(builder, loc, alloc, tmpName, shape, lenParams, + /*dummy_scope=*/nullptr, declAttrs); return hlfir::Entity{declareOp.getBase()}; } @@ -1422,8 +1421,8 @@ hlfir::convertCharacterKind(mlir::Location loc, fir::FirOpBuilder &builder, if (src.second.has_value()) src.second.value()(); - return hlfir::EntityWithAttributes{builder.create( - loc, res.getAddr(), ".temp.kindconvert", /*shape=*/nullptr, + return hlfir::EntityWithAttributes{hlfir::DeclareOp::create( + builder, loc, res.getAddr(), ".temp.kindconvert", /*shape=*/nullptr, /*typeparams=*/mlir::ValueRange{res.getLen()}, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{})}; } @@ -1494,10 +1493,10 @@ hlfir::genTypeAndKindConvert(mlir::Location loc, fir::FirOpBuilder &builder, } auto shapeShiftType = fir::ShapeShiftType::get(builder.getContext(), rank); mlir::Value shapeShift = - builder.create(loc, shapeShiftType, lbAndExtents); - auto declareOp = builder.create( - loc, associate.getFirBase(), *associate.getUniqName(), shapeShift, - associate.getTypeparams(), /*dummy_scope=*/nullptr, + fir::ShapeShiftOp::create(builder, loc, shapeShiftType, lbAndExtents); + auto declareOp = hlfir::DeclareOp::create( + builder, loc, associate.getFirBase(), *associate.getUniqName(), + shapeShift, associate.getTypeparams(), /*dummy_scope=*/nullptr, /*flags=*/fir::FortranVariableFlagsAttr{}); hlfir::Entity castWithLbounds = mlir::cast(declareOp.getOperation()); @@ -1536,8 +1535,8 @@ std::pair hlfir::computeEvaluateOpInNewTemp( extents, typeParams); mlir::Value innerMemory = evalInMem.getMemory(); temp = builder.createConvert(loc, innerMemory.getType(), temp); - auto declareOp = builder.create( - loc, temp, tmpName, shape, typeParams, + auto declareOp = hlfir::DeclareOp::create( + builder, loc, temp, tmpName, shape, typeParams, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); computeEvaluateOpIn(loc, builder, evalInMem, 
declareOp.getOriginalBase()); return {hlfir::Entity{declareOp.getBase()}, /*heapAllocated=*/heapAllocated}; @@ -1601,7 +1600,7 @@ hlfir::Entity hlfir::gen1DSection(mlir::Location loc, } } mlir::Value sectionShape = - builder.create(loc, extents[dim - 1]); + fir::ShapeOp::create(builder, loc, extents[dim - 1]); // The result type is one of: // !fir.box/class> @@ -1617,9 +1616,9 @@ hlfir::Entity hlfir::gen1DSection(mlir::Location loc, fir::SequenceType::get({dimExtent}, seqType.getEleTy()); sectionType = fir::wrapInClassOrBoxType(sectionType, array.isPolymorphic()); - auto designate = builder.create( - loc, sectionType, array, /*component=*/"", /*componentShape=*/nullptr, - subscripts, + auto designate = hlfir::DesignateOp::create( + builder, loc, sectionType, array, /*component=*/"", + /*componentShape=*/nullptr, subscripts, /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt, sectionShape, typeParams); return hlfir::Entity{designate.getResult()}; diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index d32c1fde59f27..b589a6c7c62fb 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -385,13 +385,18 @@ static constexpr IntrinsicHandler handlers[]{ &I::genChdir, {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, - {"clock64", &I::genClock64, {}, /*isElemental=*/false}, + {"clock", &I::genNVVMTime, {}, /*isElemental=*/false}, + {"clock64", + &I::genNVVMTime, + {}, + /*isElemental=*/false}, {"cmplx", &I::genCmplx, {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}}, {"command_argument_count", &I::genCommandArgumentCount}, {"conjg", &I::genConjg}, {"cosd", &I::genCosd}, + {"cospi", &I::genCospi}, {"count", &I::genCount, {{{"mask", asAddr}, {"dim", asValue}, {"kind", asValue}}}, @@ -503,6 +508,10 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", &I::genGetUID}, + {"globaltimer", + &I::genNVVMTime, + {}, + /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -892,6 +901,7 @@ static constexpr IntrinsicHandler handlers[]{ {{{"number", asValue}, {"handler", asAddr}, {"status", asAddr}}}, /*isElemental=*/false}, {"sind", &I::genSind}, + {"sinpi", &I::genSinpi}, {"size", &I::genSize, {{{"array", asBox}, @@ -1101,7 +1111,7 @@ mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc, // was just created from user functions with the same name. funcOp->setAttr(fir::FIROpsDialect::getFirRuntimeAttrName(), builder.getUnitAttr()); - auto libCall = builder.create(loc, funcOp, args); + auto libCall = fir::CallOp::create(builder, loc, funcOp, args); // TODO: ensure 'strictfp' setting on the call for "precise/strict" // FP mode. Set appropriate Fast-Math Flags otherwise. // TODO: we should also mark as many libm function as possible @@ -1115,7 +1125,7 @@ mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type soughtFuncType = funcOp.getFunctionType(); if (soughtFuncType == libFuncType) { - libCall = builder.create(loc, funcOp, args); + libCall = fir::CallOp::create(builder, loc, funcOp, args); } else { // A function with the same name might have been declared // before (e.g. with an explicit interface and a binding label). 
@@ -1129,13 +1139,13 @@ mlir::Value genLibCall(fir::FirOpBuilder &builder, mlir::Location loc, llvm::Twine("' may lead to undefined behavior."))); mlir::SymbolRefAttr funcSymbolAttr = builder.getSymbolRefAttr(libFuncName); mlir::Value funcPointer = - builder.create(loc, soughtFuncType, funcSymbolAttr); + fir::AddrOfOp::create(builder, loc, soughtFuncType, funcSymbolAttr); funcPointer = builder.createConvert(loc, libFuncType, funcPointer); llvm::SmallVector operands{funcPointer}; operands.append(args.begin(), args.end()); - libCall = builder.create(loc, mlir::SymbolRefAttr{}, - libFuncType.getResults(), operands); + libCall = fir::CallOp::create(builder, loc, mlir::SymbolRefAttr{}, + libFuncType.getResults(), operands); } LLVM_DEBUG(libCall.dump(); llvm::dbgs() << "\n"); @@ -1211,7 +1221,7 @@ mlir::Value genMathOp(fir::FirOpBuilder &builder, mlir::Location loc, LLVM_DEBUG(llvm::dbgs() << "Generating '" << mathLibFuncName << "' operation with type "; mathLibFuncType.dump(); llvm::dbgs() << "\n"); - result = builder.create(loc, args); + result = T::create(builder, loc, args); } LLVM_DEBUG(result.dump(); llvm::dbgs() << "\n"); return result; @@ -1231,8 +1241,11 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc, llvm::StringRef mathLibFuncName = mathOp.runtimeFunc; if (!mathLibFuncName.empty()) { // If we enabled MLIR complex or can use approximate operations, we should - // NOT use libm. - if (!forceMlirComplex && !canUseApprox) { + // NOT use libm. Avoid libm when targeting AMDGPU as those symbols are not + // available on the device and we rely on MLIR complex operations to + // later map to OCML calls. + bool isAMDGPU = fir::getTargetTriple(builder.getModule()).isAMDGCN(); + if (!forceMlirComplex && !canUseApprox && !isAMDGPU) { result = genLibCall(builder, loc, mathOp, mathLibFuncType, args); LLVM_DEBUG(result.dump(); llvm::dbgs() << "\n"); return result; @@ -1246,12 +1259,12 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc, // the argument types for an operation if constexpr (T::template hasTrait< mlir::OpTrait::SameOperandsAndResultType>()) { - result = builder.create(loc, args); + result = T::create(builder, loc, args); result = builder.createConvert(loc, mathLibFuncType.getResult(0), result); } else { auto complexTy = mlir::cast(mathLibFuncType.getInput(0)); auto realTy = complexTy.getElementType(); - result = builder.create(loc, realTy, args); + result = T::create(builder, loc, realTy, args); result = builder.createConvert(loc, mathLibFuncType.getResult(0), result); } @@ -2449,7 +2462,7 @@ IntrinsicLibrary::outlineInWrapper(GeneratorType generator, nameOS << '.' 
<< fmfString; } mlir::func::FuncOp wrapper = getWrapper(generator, funcName, funcType); - return builder.create(loc, wrapper, args).getResult(0); + return fir::CallOp::create(builder, loc, wrapper, args).getResult(0); } template @@ -2465,7 +2478,7 @@ fir::ExtendedValue IntrinsicLibrary::outlineInExtendedWrapper( mlirArgs.emplace_back(toValue(extendedVal, builder, loc)); mlir::FunctionType funcType = getFunctionType(resultType, mlirArgs, builder); mlir::func::FuncOp wrapper = getWrapper(generator, name, funcType); - auto call = builder.create(loc, wrapper, mlirArgs); + auto call = fir::CallOp::create(builder, loc, wrapper, mlirArgs); if (resultType) return toExtendedValue(call.getResult(0), builder, loc); // Subroutine calls @@ -2583,9 +2596,9 @@ IntrinsicLibrary::readAndAddCleanUp(fir::MutableBoxValue resultMutableBox, return box; }, [&](const mlir::Value &tempAddr) -> fir::ExtendedValue { - auto load = builder.create(loc, resultType, tempAddr); + auto load = fir::LoadOp::create(builder, loc, resultType, tempAddr); // Temp can be freed right away since it was loaded. - builder.create(loc, tempAddr); + fir::FreeMemOp::create(builder, loc, tempAddr); return load; }, [&](const fir::CharBoxValue &box) -> fir::ExtendedValue { @@ -2638,9 +2651,9 @@ mlir::Value IntrinsicLibrary::genAbs(mlir::Type resultType, // So, implement abs here without branching. mlir::Value shift = builder.createIntegerConstant(loc, intType, intType.getWidth() - 1); - auto mask = builder.create(loc, arg, shift); - auto xored = builder.create(loc, arg, mask); - return builder.create(loc, xored, mask); + auto mask = mlir::arith::ShRSIOp::create(builder, loc, arg, shift); + auto xored = mlir::arith::XOrIOp::create(builder, loc, arg, mask); + return mlir::arith::SubIOp::create(builder, loc, xored, mask); } llvm_unreachable("unexpected type in ABS argument"); } @@ -2659,7 +2672,7 @@ mlir::Value IntrinsicLibrary::genAcosd(mlir::Type resultType, mlir::Value dfactor = builder.createRealConstant( loc, mlir::Float64Type::get(context), llvm::APFloat(180.0) / pi); mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); - return builder.create(loc, result, factor); + return mlir::arith::MulFOp::create(builder, loc, result, factor); } // ADJUSTL & ADJUSTR @@ -2812,7 +2825,7 @@ mlir::Value IntrinsicLibrary::genAsind(mlir::Type resultType, mlir::Value dfactor = builder.createRealConstant( loc, mlir::Float64Type::get(context), llvm::APFloat(180.0) / pi); mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); - return builder.create(loc, result, factor); + return mlir::arith::MulFOp::create(builder, loc, result, factor); } // ATAND, ATAN2D @@ -2826,8 +2839,8 @@ mlir::Value IntrinsicLibrary::genAtand(mlir::Type resultType, // atand = atan * 180/pi if (args.size() == 2) { - atan = builder.create(loc, fir::getBase(args[0]), - fir::getBase(args[1])); + atan = mlir::math::Atan2Op::create(builder, loc, fir::getBase(args[0]), + fir::getBase(args[1])); } else { mlir::FunctionType ftype = mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); @@ -2837,7 +2850,7 @@ mlir::Value IntrinsicLibrary::genAtand(mlir::Type resultType, mlir::Value dfactor = builder.createRealConstant( loc, mlir::Float64Type::get(context), llvm::APFloat(180.0) / pi); mlir::Value factor = builder.createConvert(loc, resultType, dfactor); - return builder.create(loc, atan, factor); + return mlir::arith::MulFOp::create(builder, loc, atan, factor); } // ATANPI, ATAN2PI @@ -2851,8 +2864,8 @@ mlir::Value 
IntrinsicLibrary::genAtanpi(mlir::Type resultType, // atanpi = atan / pi if (args.size() == 2) { - atan = builder.create(loc, fir::getBase(args[0]), - fir::getBase(args[1])); + atan = mlir::math::Atan2Op::create(builder, loc, fir::getBase(args[0]), + fir::getBase(args[1])); } else { mlir::FunctionType ftype = mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); @@ -2862,7 +2875,7 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType, mlir::Value dfactor = builder.createRealConstant(loc, mlir::Float64Type::get(context), inv_pi); mlir::Value factor = builder.createConvert(loc, resultType, dfactor); - return builder.create(loc, atan, factor); + return mlir::arith::MulFOp::create(builder, loc, atan, factor); } static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, @@ -2870,8 +2883,8 @@ static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, mlir::Value arg1) { auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); arg0 = builder.createConvert(loc, llvmPointerType, arg0); - return builder.create( - loc, binOp, arg0, arg1, mlir::LLVM::AtomicOrdering::seq_cst); + return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, + mlir::LLVM::AtomicOrdering::seq_cst); } mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, @@ -2929,11 +2942,11 @@ IntrinsicLibrary::genAtomicCas(mlir::Type resultType, auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { if (mlir::isa(arg.getType())) - return builder.create(loc, builder.getI32Type(), - arg); + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), + arg); if (mlir::isa(arg.getType())) - return builder.create(loc, builder.getI64Type(), - arg); + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), + arg); return arg; }; @@ -2946,11 +2959,11 @@ IntrinsicLibrary::genAtomicCas(mlir::Type resultType, } auto address = - builder.create(loc, llvmPtrTy, arg0) + mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) .getResult(0); - auto cmpxchg = builder.create( - loc, address, arg1, arg2, successOrdering, failureOrdering); - return builder.create(loc, cmpxchg, 1); + auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( + builder, loc, address, arg1, arg2, successOrdering, failureOrdering); + return mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); } mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType, @@ -3026,31 +3039,31 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, mlir::isa(ptrTy))) { mlir::Value pointerBoxProc = fir::isBoxProcAddressType(ptrTy) - ? builder.create(loc, fir::getBase(args[0])) + ? 
fir::LoadOp::create(builder, loc, fir::getBase(args[0])) : fir::getBase(args[0]); mlir::Value pointerTarget = - builder.create(loc, pointerBoxProc); + fir::BoxAddrOp::create(builder, loc, pointerBoxProc); if (isStaticallyAbsent(args[1])) return builder.genIsNotNullAddr(loc, pointerTarget); mlir::Value target = fir::getBase(args[1]); if (fir::isBoxProcAddressType(target.getType())) - target = builder.create(loc, target); + target = fir::LoadOp::create(builder, loc, target); if (mlir::isa(target.getType())) - target = builder.create(loc, target); + target = fir::BoxAddrOp::create(builder, loc, target); mlir::Type intPtrTy = builder.getIntPtrType(); mlir::Value pointerInt = builder.createConvert(loc, intPtrTy, pointerTarget); mlir::Value targetInt = builder.createConvert(loc, intPtrTy, target); - mlir::Value sameTarget = builder.create( - loc, mlir::arith::CmpIPredicate::eq, pointerInt, targetInt); + mlir::Value sameTarget = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, pointerInt, targetInt); mlir::Value zero = builder.createIntegerConstant(loc, intPtrTy, 0); - mlir::Value notNull = builder.create( - loc, mlir::arith::CmpIPredicate::ne, zero, pointerInt); + mlir::Value notNull = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, zero, pointerInt); // The not notNull test covers the following two cases: // - TARGET is a procedure that is OPTIONAL and absent at runtime. // - TARGET is a procedure pointer that is NULL. // In both cases, ASSOCIATED should be false if POINTER is NULL. - return builder.create(loc, sameTarget, notNull); + return mlir::arith::AndIOp::create(builder, loc, sameTarget, notNull); } auto *pointer = args[0].match([&](const fir::MutableBoxValue &x) { return &x; }, @@ -3063,7 +3076,7 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, mlir::Value targetBox = builder.createBox(loc, target); mlir::Value pointerBoxRef = fir::factory::getMutableIRBox(builder, loc, *pointer); - auto pointerBox = builder.create(loc, pointerBoxRef); + auto pointerBox = fir::LoadOp::create(builder, loc, pointerBoxRef); return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } @@ -3094,12 +3107,12 @@ IntrinsicLibrary::genBesselJn(mlir::Type resultType, mlir::Value resultBox = fir::factory::getMutableIRBox(builder, loc, resultMutableBox); - mlir::Value cmpXEq0 = builder.create( - loc, mlir::arith::CmpFPredicate::UEQ, x, zero); - mlir::Value cmpN1LtN2 = builder.create( - loc, mlir::arith::CmpIPredicate::slt, n1, n2); - mlir::Value cmpN1EqN2 = builder.create( - loc, mlir::arith::CmpIPredicate::eq, n1, n2); + mlir::Value cmpXEq0 = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::UEQ, x, zero); + mlir::Value cmpN1LtN2 = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, n1, n2); + mlir::Value cmpN1EqN2 = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, n1, n2); auto genXEq0 = [&]() { fir::runtime::genBesselJnX0(builder, loc, floatTy, resultBox, n1, n2); @@ -3111,7 +3124,7 @@ IntrinsicLibrary::genBesselJn(mlir::Type resultType, // https://dlmf.nist.gov/10.6.E1). When n1 < n2, this requires // the values of BESSEL_JN(n2) and BESSEL_JN(n2 - 1) since they // are the anchors of the recursion. 
- mlir::Value n2_1 = builder.create(loc, n2, one); + mlir::Value n2_1 = mlir::arith::SubIOp::create(builder, loc, n2, one); mlir::Value bn2 = genRuntimeCall("bessel_jn", resultType, {n2, x}); mlir::Value bn2_1 = genRuntimeCall("bessel_jn", resultType, {n2_1, x}); fir::runtime::genBesselJn(builder, loc, resultBox, n1, n2, x, bn2, bn2_1); @@ -3179,12 +3192,12 @@ IntrinsicLibrary::genBesselYn(mlir::Type resultType, mlir::Value resultBox = fir::factory::getMutableIRBox(builder, loc, resultMutableBox); - mlir::Value cmpXEq0 = builder.create( - loc, mlir::arith::CmpFPredicate::UEQ, x, zero); - mlir::Value cmpN1LtN2 = builder.create( - loc, mlir::arith::CmpIPredicate::slt, n1, n2); - mlir::Value cmpN1EqN2 = builder.create( - loc, mlir::arith::CmpIPredicate::eq, n1, n2); + mlir::Value cmpXEq0 = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::UEQ, x, zero); + mlir::Value cmpN1LtN2 = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, n1, n2); + mlir::Value cmpN1EqN2 = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, n1, n2); auto genXEq0 = [&]() { fir::runtime::genBesselYnX0(builder, loc, floatTy, resultBox, n1, n2); @@ -3196,7 +3209,7 @@ IntrinsicLibrary::genBesselYn(mlir::Type resultType, // https://dlmf.nist.gov/10.6.E1). When n1 < n2, this requires // the values of BESSEL_YN(n1) and BESSEL_YN(n1 + 1) since they // are the anchors of the recursion. - mlir::Value n1_1 = builder.create(loc, n1, one); + mlir::Value n1_1 = mlir::arith::AddIOp::create(builder, loc, n1, one); mlir::Value bn1 = genRuntimeCall("bessel_yn", resultType, {n1, x}); mlir::Value bn1_1 = genRuntimeCall("bessel_yn", resultType, {n1_1, x}); fir::runtime::genBesselYn(builder, loc, resultBox, n1, n2, x, bn1, bn1_1); @@ -3268,12 +3281,12 @@ IntrinsicLibrary::genBitwiseCompare(mlir::Type resultType, if (arg0Ty.isUnsignedInteger()) arg0 = builder.createConvert(loc, signlessType, arg0); else if (bits0 < widest) - arg0 = builder.create(loc, signlessType, arg0); + arg0 = mlir::arith::ExtUIOp::create(builder, loc, signlessType, arg0); if (arg1Ty.isUnsignedInteger()) arg1 = builder.createConvert(loc, signlessType, arg1); else if (bits1 < widest) - arg1 = builder.create(loc, signlessType, arg1); - return builder.create(loc, pred, arg0, arg1); + arg1 = mlir::arith::ExtUIOp::create(builder, loc, signlessType, arg1); + return mlir::arith::CmpIOp::create(builder, loc, pred, arg0, arg1); } // BTEST @@ -3292,9 +3305,9 @@ mlir::Value IntrinsicLibrary::genBtest(mlir::Type resultType, word = builder.createConvert(loc, signlessType, word); mlir::Value shiftCount = builder.createConvert(loc, signlessType, args[1]); mlir::Value shifted = - builder.create(loc, word, shiftCount); + mlir::arith::ShRUIOp::create(builder, loc, word, shiftCount); mlir::Value one = builder.createIntegerConstant(loc, signlessType, 1); - mlir::Value bit = builder.create(loc, shifted, one); + mlir::Value bit = mlir::arith::AndIOp::create(builder, loc, shifted, one); return builder.createConvert(loc, resultType, bit); } @@ -3305,11 +3318,11 @@ static mlir::Value getAddrFromBox(fir::FirOpBuilder &builder, mlir::Value addr{nullptr}; if (isFunc) { auto funcTy = mlir::cast(argValue.getType()).getEleTy(); - addr = builder.create(loc, funcTy, argValue); + addr = fir::BoxAddrOp::create(builder, loc, funcTy, argValue); } else { const auto *box = arg.getBoxOf(); - addr = builder.create(loc, box->getMemTy(), - fir::getBase(*box)); + addr = fir::BoxAddrOp::create(builder, loc, box->getMemTy(), + 
fir::getBase(*box)); } return addr; } @@ -3319,7 +3332,7 @@ genCLocOrCFunLoc(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type resultType, llvm::ArrayRef args, bool isFunc = false, bool isDevLoc = false) { assert(args.size() == 1); - mlir::Value res = builder.create(loc, resultType); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); mlir::Value resAddr; if (isDevLoc) resAddr = fir::factory::genCDevPtrAddr(builder, loc, res, resultType); @@ -3330,7 +3343,7 @@ genCLocOrCFunLoc(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value argAddr = getAddrFromBox(builder, loc, args[0], isFunc); mlir::Value argAddrVal = builder.createConvert( loc, fir::unwrapRefType(resAddr.getType()), argAddr); - builder.create(loc, argAddrVal, resAddr); + fir::StoreOp::create(builder, loc, argAddrVal, resAddr); return res; } @@ -3343,8 +3356,8 @@ genCAssociated(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value cPtrVal1 = fir::factory::genCPtrOrCFunptrValue(builder, loc, cPtr1); mlir::Value zero = builder.createIntegerConstant(loc, cPtrVal1.getType(), 0); - mlir::Value res = builder.create( - loc, mlir::arith::CmpIPredicate::ne, cPtrVal1, zero); + mlir::Value res = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, cPtrVal1, zero); if (isStaticallyPresent(args[1])) { mlir::Type i1Ty = builder.getI1Type(); @@ -3353,15 +3366,16 @@ genCAssociated(fir::FirOpBuilder &builder, mlir::Location loc, res = builder .genIfOp(loc, {i1Ty}, isDynamicallyAbsent, /*withElseRegion=*/true) - .genThen([&]() { builder.create(loc, res); }) + .genThen([&]() { fir::ResultOp::create(builder, loc, res); }) .genElse([&]() { mlir::Value cPtrVal2 = fir::factory::genCPtrOrCFunptrValue(builder, loc, cPtr2); - mlir::Value cmpVal = builder.create( - loc, mlir::arith::CmpIPredicate::eq, cPtrVal1, cPtrVal2); + mlir::Value cmpVal = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, cPtrVal1, + cPtrVal2); mlir::Value newRes = - builder.create(loc, res, cmpVal); - builder.create(loc, newRes); + mlir::arith::AndIOp::create(builder, loc, res, cmpVal); + fir::ResultOp::create(builder, loc, newRes); }) .getResults()[0]; } @@ -3416,9 +3430,9 @@ void IntrinsicLibrary::genCFPointer(llvm::ArrayRef args) { mlir::Type idxType = builder.getIndexType(); for (int i = 0; i < arrayRank; ++i) { mlir::Value index = builder.createIntegerConstant(loc, idxType, i); - mlir::Value var = builder.create( - loc, builder.getRefType(shapeElementType), shape, index); - mlir::Value load = builder.create(loc, var); + mlir::Value var = fir::CoordinateOp::create( + builder, loc, builder.getRefType(shapeElementType), shape, index); + mlir::Value load = fir::LoadOp::create(builder, loc, var); extents.push_back(builder.createConvert(loc, idxType, load)); } } @@ -3462,8 +3476,8 @@ void IntrinsicLibrary::genCFProcPointer( mlir::Value cptrCast = builder.createConvert(loc, boxProcType.getEleTy(), cptr); mlir::Value cptrBox = - builder.create(loc, boxProcType, cptrCast); - builder.create(loc, cptrBox, fptr); + fir::EmboxProcOp::create(builder, loc, boxProcType, cptrCast); + fir::StoreOp::create(builder, loc, cptrBox, fptr); } // C_FUNLOC @@ -3493,7 +3507,7 @@ IntrinsicLibrary::genCPtrCompare(mlir::Type resultType, mlir::Value cPtrVal2 = fir::factory::genCPtrOrCFunptrValue(builder, loc, cPtr2); mlir::Value cmp = - builder.create(loc, pred, cPtrVal1, cPtrVal2); + mlir::arith::CmpIOp::create(builder, loc, pred, cPtrVal1, cPtrVal2); return builder.createConvert(loc, resultType, cmp); } @@ -3557,16 
+3571,6 @@ IntrinsicLibrary::genChdir(std::optional<mlir::Type> resultType,
   return {};
 }
 
-// CLOCK64
-mlir::Value IntrinsicLibrary::genClock64(mlir::Type resultType,
-                                         llvm::ArrayRef<mlir::Value> args) {
-  constexpr llvm::StringLiteral funcName = "llvm.nvvm.read.ptx.sreg.clock64";
-  mlir::MLIRContext *context = builder.getContext();
-  mlir::FunctionType ftype = mlir::FunctionType::get(context, {}, {resultType});
-  auto funcOp = builder.createFunction(loc, funcName, ftype);
-  return builder.create<fir::CallOp>(loc, funcOp, args).getResult(0);
-}
-
 // CMPLX
 mlir::Value IntrinsicLibrary::genCmplx(mlir::Type resultType,
                                        llvm::ArrayRef args) {
@@ -3602,7 +3606,7 @@ mlir::Value IntrinsicLibrary::genConjg(mlir::Type resultType,
   mlir::Value cplx = args[0];
   auto imag = fir::factory::Complex{builder, loc}.extractComplexPart(
       cplx, /*isImagPart=*/true);
-  auto negImag = builder.create<mlir::arith::NegFOp>(loc, imag);
+  auto negImag = mlir::arith::NegFOp::create(builder, loc, imag);
   return fir::factory::Complex{builder, loc}.insertComplexPart(
       cplx, negImag, /*isImagPart=*/true);
 }
@@ -3618,7 +3622,22 @@ mlir::Value IntrinsicLibrary::genCosd(mlir::Type resultType,
   mlir::Value dfactor = builder.createRealConstant(
       loc, mlir::Float64Type::get(context), pi / llvm::APFloat(180.0));
   mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor);
-  mlir::Value arg = builder.create<mlir::arith::MulFOp>(loc, args[0], factor);
+  mlir::Value arg = mlir::arith::MulFOp::create(builder, loc, args[0], factor);
+  return getRuntimeCallGenerator("cos", ftype)(builder, loc, {arg});
+}
+
+// COSPI
+mlir::Value IntrinsicLibrary::genCospi(mlir::Type resultType,
+                                       llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 1);
+  mlir::MLIRContext *context = builder.getContext();
+  mlir::FunctionType ftype =
+      mlir::FunctionType::get(context, {resultType}, {args[0].getType()});
+  llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi);
+  mlir::Value dfactor =
+      builder.createRealConstant(loc, mlir::Float64Type::get(context), pi);
+  mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor);
+  mlir::Value arg = mlir::arith::MulFOp::create(builder, loc, args[0], factor);
   return getRuntimeCallGenerator("cos", ftype)(builder, loc, {arg});
 }
 
@@ -3679,7 +3698,7 @@ void IntrinsicLibrary::genCpuTime(llvm::ArrayRef<fir::ExtendedValue> args) {
   mlir::Value res1 = fir::runtime::genCpuTime(builder, loc);
   mlir::Value res2 =
       builder.createConvert(loc, fir::dyn_cast_ptrEleTy(arg->getType()), res1);
-  builder.create<fir::StoreOp>(loc, res2, *arg);
+  fir::StoreOp::create(builder, loc, res2, *arg);
 }
 
 // CSHIFT
@@ -3706,7 +3725,7 @@ IntrinsicLibrary::genCshift(mlir::Type resultType,
     // Handle required SHIFT argument as a scalar
     const mlir::Value *shiftAddr = args[1].getUnboxed();
     assert(shiftAddr && "nonscalar CSHIFT argument");
-    auto shift = builder.create<fir::LoadOp>(loc, *shiftAddr);
+    auto shift = fir::LoadOp::create(builder, loc, *shiftAddr);
 
     fir::runtime::genCshiftVector(builder, loc, resultIrBox, array, shift);
   } else {
@@ -3732,9 +3751,9 @@ IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType,
   assert(args.size() == 1);
   mlir::Type resTy = fir::SequenceType::get(extent, resultType);
   mlir::Value arg = fir::getBase(args[0]);
-  mlir::Value res = builder.create<fir::AllocaOp>(loc, resTy);
+  mlir::Value res = fir::AllocaOp::create(builder, loc, resTy);
  if (mlir::isa<fir::BaseBoxType>(arg.getType()))
-    arg = builder.create<fir::BoxAddrOp>(loc, arg);
+    arg = fir::BoxAddrOp::create(builder, loc, arg);
   mlir::Type refResTy = fir::ReferenceType::get(resTy);
   mlir::FunctionType ftype =
       mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {});
@@ -3742,7 +3761,7 @@
IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType, llvm::SmallVector funcArgs; funcArgs.push_back(res); funcArgs.push_back(arg); - builder.create(loc, funcOp, funcArgs); + fir::CallOp::create(builder, loc, funcOp, funcArgs); mlir::Value ext = builder.createIntegerConstant(loc, builder.getIndexType(), extent); return fir::ArrayBoxValue(res, {ext}); @@ -3758,8 +3777,8 @@ void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef args) { mlir::Value values = fir::getBase(args[3]); if (!values) - values = builder.create( - loc, fir::BoxType::get(builder.getNoneType())); + values = fir::AbsentOp::create(builder, loc, + fir::BoxType::get(builder.getNoneType())); fir::runtime::genDateAndTime(builder, loc, charArgs[0], charArgs[1], charArgs[2], values); @@ -3771,17 +3790,17 @@ mlir::Value IntrinsicLibrary::genDim(mlir::Type resultType, assert(args.size() == 2); if (mlir::isa(resultType)) { mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); - auto diff = builder.create(loc, args[0], args[1]); - auto cmp = builder.create( - loc, mlir::arith::CmpIPredicate::sgt, diff, zero); - return builder.create(loc, cmp, diff, zero); + auto diff = mlir::arith::SubIOp::create(builder, loc, args[0], args[1]); + auto cmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sgt, diff, zero); + return mlir::arith::SelectOp::create(builder, loc, cmp, diff, zero); } assert(fir::isa_real(resultType) && "Only expects real and integer in DIM"); mlir::Value zero = builder.createRealZeroConstant(loc, resultType); - auto diff = builder.create(loc, args[0], args[1]); - auto cmp = builder.create( - loc, mlir::arith::CmpFPredicate::OGT, diff, zero); - return builder.create(loc, cmp, diff, zero); + auto diff = mlir::arith::SubFOp::create(builder, loc, args[0], args[1]); + auto cmp = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OGT, diff, zero); + return mlir::arith::SelectOp::create(builder, loc, cmp, diff, zero); } // DOT_PRODUCT @@ -3799,12 +3818,13 @@ IntrinsicLibrary::genDotProduct(mlir::Type resultType, if (fir::isa_complex(eleTy)) { mlir::Value result = builder.createTemporary(loc, eleTy); fir::runtime::genDotProduct(builder, loc, vectorA, vectorB, result); - return builder.create(loc, result); + return fir::LoadOp::create(builder, loc, result); } // This operation is only used to pass the result type // information to the DotProduct generator. 
- auto resultBox = builder.create(loc, fir::BoxType::get(eleTy)); + auto resultBox = + fir::AbsentOp::create(builder, loc, fir::BoxType::get(eleTy)); return fir::runtime::genDotProduct(builder, loc, vectorA, vectorB, resultBox); } @@ -3816,7 +3836,7 @@ mlir::Value IntrinsicLibrary::genDprod(mlir::Type resultType, "Result must be double precision in DPROD"); mlir::Value a = builder.createConvert(loc, resultType, args[0]); mlir::Value b = builder.createConvert(loc, resultType, args[1]); - return builder.create(loc, a, b); + return mlir::arith::MulFOp::create(builder, loc, a, b); } // DSHIFTL @@ -3839,14 +3859,14 @@ mlir::Value IntrinsicLibrary::genDshiftl(mlir::Type resultType, // Per the standard, the value of DSHIFTL(I, J, SHIFT) is equal to // IOR (SHIFTL(I, SHIFT), SHIFTR(J, BIT_SIZE(J) - SHIFT)) - mlir::Value diff = builder.create(loc, bitSize, shift); + mlir::Value diff = mlir::arith::SubIOp::create(builder, loc, bitSize, shift); mlir::Value lArgs[2]{i, shift}; mlir::Value lft = genShift(signlessType, lArgs); mlir::Value rArgs[2]{j, diff}; mlir::Value rgt = genShift(signlessType, rArgs); - mlir::Value result = builder.create(loc, lft, rgt); + mlir::Value result = mlir::arith::OrIOp::create(builder, loc, lft, rgt); if (resultType.isUnsignedInteger()) return builder.createConvert(loc, resultType, result); return result; @@ -3872,14 +3892,14 @@ mlir::Value IntrinsicLibrary::genDshiftr(mlir::Type resultType, // Per the standard, the value of DSHIFTR(I, J, SHIFT) is equal to // IOR (SHIFTL(I, BIT_SIZE(I) - SHIFT), SHIFTR(J, SHIFT)) - mlir::Value diff = builder.create(loc, bitSize, shift); + mlir::Value diff = mlir::arith::SubIOp::create(builder, loc, bitSize, shift); mlir::Value lArgs[2]{i, diff}; mlir::Value lft = genShift(signlessType, lArgs); mlir::Value rArgs[2]{j, shift}; mlir::Value rgt = genShift(signlessType, rArgs); - mlir::Value result = builder.create(loc, lft, rgt); + mlir::Value result = mlir::arith::OrIOp::create(builder, loc, lft, rgt); if (resultType.isUnsignedInteger()) return builder.createConvert(loc, resultType, result); return result; @@ -3907,8 +3927,8 @@ IntrinsicLibrary::genEoshift(mlir::Type resultType, // Handle optional BOUNDARY argument mlir::Value boundary = isStaticallyAbsent(args[2]) - ? builder.create( - loc, fir::BoxType::get(builder.getNoneType())) + ? 
fir::AbsentOp::create(builder, loc, + fir::BoxType::get(builder.getNoneType())) : builder.createBox(loc, args[2]); if (arrayRank == 1) { @@ -3916,7 +3936,7 @@ IntrinsicLibrary::genEoshift(mlir::Type resultType, // Handle required SHIFT argument as a scalar const mlir::Value *shiftAddr = args[1].getUnboxed(); assert(shiftAddr && "nonscalar EOSHIFT SHIFT argument"); - auto shift = builder.create(loc, *shiftAddr); + auto shift = fir::LoadOp::create(builder, loc, *shiftAddr); fir::runtime::genEoshiftVector(builder, loc, resultIrBox, array, shift, boundary); } else { @@ -3964,14 +3984,15 @@ void IntrinsicLibrary::genExecuteCommandLine( .genIfOp(loc, {i1Ty}, waitIsPresentAtRuntime, /*withElseRegion=*/true) .genThen([&]() { - auto waitLoad = builder.create(loc, waitAddr); + auto waitLoad = + fir::LoadOp::create(builder, loc, waitAddr); mlir::Value cast = builder.createConvert(loc, i1Ty, waitLoad); - builder.create(loc, cast); + fir::ResultOp::create(builder, loc, cast); }) .genElse([&]() { mlir::Value trueVal = builder.createBool(loc, true); - builder.create(loc, trueVal); + fir::ResultOp::create(builder, loc, trueVal); }) .getResults()[0]; } @@ -3979,15 +4000,15 @@ void IntrinsicLibrary::genExecuteCommandLine( mlir::Value exitstatBox = isStaticallyPresent(exitstat) ? fir::getBase(exitstat) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value cmdstatBox = isStaticallyPresent(cmdstat) ? fir::getBase(cmdstat) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value cmdmsgBox = isStaticallyPresent(cmdmsg) ? fir::getBase(cmdmsg) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); fir::runtime::genExecuteCommandLine(builder, loc, command, waitBool, exitstatBox, cmdstatBox, cmdmsgBox); } @@ -4008,7 +4029,7 @@ IntrinsicLibrary::genEtime(std::optional resultType, auto timeAddr = builder.createTemporary(loc, *resultType); auto timeBox = builder.createBox(loc, timeAddr); fir::runtime::genEtime(builder, loc, values, timeBox); - return builder.create(loc, timeAddr); + return fir::LoadOp::create(builder, loc, timeAddr); } else { // subroutine form mlir::Value time = fir::getBase(args[1]); @@ -4081,8 +4102,8 @@ IntrinsicLibrary::genFindloc(mlir::Type resultType, // Handle optional mask argument auto mask = isStaticallyAbsent(args[3]) - ? builder.create( - loc, fir::BoxType::get(builder.getI1Type())) + ? fir::AbsentOp::create( + builder, loc, fir::BoxType::get(builder.getI1Type())) : builder.createBox(loc, args[3]); // Handle optional kind argument @@ -4271,15 +4292,15 @@ void IntrinsicLibrary::genGetCommand(llvm::ArrayRef args) { mlir::Value commandBox = isStaticallyPresent(command) ? fir::getBase(command) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value lenBox = isStaticallyPresent(length) ? fir::getBase(length) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value errBox = isStaticallyPresent(errmsg) ? 
fir::getBase(errmsg) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value stat = fir::runtime::genGetCommand(builder, loc, commandBox, lenBox, errBox); if (isStaticallyPresent(status)) { @@ -4338,15 +4359,15 @@ void IntrinsicLibrary::genGetCommandArgument( mlir::Value valBox = isStaticallyPresent(value) ? fir::getBase(value) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value lenBox = isStaticallyPresent(length) ? fir::getBase(length) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value errBox = isStaticallyPresent(errmsg) ? fir::getBase(errmsg) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value stat = fir::runtime::genGetCommandArgument( builder, loc, number, valBox, lenBox, errBox); if (isStaticallyPresent(status)) { @@ -4391,13 +4412,14 @@ void IntrinsicLibrary::genGetEnvironmentVariable( .genIfOp(loc, {i1Ty}, trimNameIsPresentAtRuntime, /*withElseRegion=*/true) .genThen([&]() { - auto trimLoad = builder.create(loc, trimNameAddr); + auto trimLoad = + fir::LoadOp::create(builder, loc, trimNameAddr); mlir::Value cast = builder.createConvert(loc, i1Ty, trimLoad); - builder.create(loc, cast); + fir::ResultOp::create(builder, loc, cast); }) .genElse([&]() { mlir::Value trueVal = builder.createBool(loc, true); - builder.create(loc, trueVal); + fir::ResultOp::create(builder, loc, trueVal); }) .getResults()[0]; } @@ -4406,15 +4428,15 @@ void IntrinsicLibrary::genGetEnvironmentVariable( mlir::Value valBox = isStaticallyPresent(value) ? fir::getBase(value) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value lenBox = isStaticallyPresent(length) ? fir::getBase(length) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value errBox = isStaticallyPresent(errmsg) ? fir::getBase(errmsg) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); mlir::Value stat = fir::runtime::genGetEnvVariable(builder, loc, name, valBox, lenBox, trim, errBox); if (isStaticallyPresent(status)) { @@ -4499,8 +4521,8 @@ IntrinsicLibrary::genReduction(FN func, FD funcDim, llvm::StringRef errMsg, // Handle optional mask argument auto mask = isStaticallyAbsent(args[2]) - ? builder.create( - loc, fir::BoxType::get(builder.getI1Type())) + ? fir::AbsentOp::create( + builder, loc, fir::BoxType::get(builder.getI1Type())) : builder.createBox(loc, args[2]); bool absentDim = isStaticallyAbsent(args[1]); @@ -4514,10 +4536,10 @@ IntrinsicLibrary::genReduction(FN func, FD funcDim, llvm::StringRef errMsg, if (fir::isa_complex(eleTy)) { mlir::Value result = builder.createTemporary(loc, eleTy); func(builder, loc, array, mask, result); - return builder.create(loc, result); + return fir::LoadOp::create(builder, loc, result); } - auto resultBox = builder.create( - loc, fir::BoxType::get(builder.getI1Type())); + auto resultBox = fir::AbsentOp::create( + builder, loc, fir::BoxType::get(builder.getI1Type())); return func(builder, loc, array, mask, resultBox); } // Handle Product/Sum cases that have an array result. 
@@ -4564,8 +4586,8 @@ mlir::Value IntrinsicLibrary::genIbclr(mlir::Type resultType, mlir::Value one = builder.createIntegerConstant(loc, signlessType, 1); mlir::Value ones = builder.createAllOnesInteger(loc, signlessType); mlir::Value pos = builder.createConvert(loc, signlessType, args[1]); - mlir::Value bit = builder.create(loc, one, pos); - mlir::Value mask = builder.create(loc, ones, bit); + mlir::Value bit = mlir::arith::ShLIOp::create(builder, loc, one, pos); + mlir::Value mask = mlir::arith::XOrIOp::create(builder, loc, ones, bit); return builder.createUnsigned(loc, resultType, args[0], mask); } @@ -4593,18 +4615,18 @@ mlir::Value IntrinsicLibrary::genIbits(mlir::Type resultType, mlir::Value bitSize = builder.createIntegerConstant( loc, signlessType, mlir::cast(resultType).getWidth()); mlir::Value shiftCount = - builder.create(loc, bitSize, len); + mlir::arith::SubIOp::create(builder, loc, bitSize, len); mlir::Value zero = builder.createIntegerConstant(loc, signlessType, 0); mlir::Value ones = builder.createAllOnesInteger(loc, signlessType); mlir::Value mask = - builder.create(loc, ones, shiftCount); + mlir::arith::ShRUIOp::create(builder, loc, ones, shiftCount); mlir::Value res1 = builder.createUnsigned( loc, signlessType, word, pos); - mlir::Value res2 = builder.create(loc, res1, mask); - mlir::Value lenIsZero = builder.create( - loc, mlir::arith::CmpIPredicate::eq, len, zero); + mlir::Value res2 = mlir::arith::AndIOp::create(builder, loc, res1, mask); + mlir::Value lenIsZero = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, len, zero); mlir::Value result = - builder.create(loc, lenIsZero, zero, res2); + mlir::arith::SelectOp::create(builder, loc, lenIsZero, zero, res2); if (resultType.isUnsignedInteger()) return builder.createConvert(loc, resultType, result); return result; @@ -4623,7 +4645,7 @@ mlir::Value IntrinsicLibrary::genIbset(mlir::Type resultType, mlir::IntegerType::SignednessSemantics::Signless); mlir::Value one = builder.createIntegerConstant(loc, signlessType, 1); mlir::Value pos = builder.createConvert(loc, signlessType, args[1]); - mlir::Value mask = builder.create(loc, one, pos); + mlir::Value mask = mlir::arith::ShLIOp::create(builder, loc, one, pos); return builder.createUnsigned(loc, resultType, args[0], mask); } @@ -4657,13 +4679,13 @@ IntrinsicLibrary::genIchar(mlir::Type resultType, fir::CharacterType::get(builder.getContext(), eleType.getFKind(), 1); mlir::Type toTy = builder.getRefType(charType); mlir::Value cast = builder.createConvert(loc, toTy, buffer); - charVal = builder.create(loc, cast); + charVal = fir::LoadOp::create(builder, loc, cast); } LLVM_DEBUG(llvm::dbgs() << "ichar(" << charVal << ")\n"); auto code = helper.extractCodeFromSingleton(charVal); if (code.getType() == resultType) return code; - return builder.create(loc, resultType, code); + return mlir::arith::ExtUIOp::create(builder, loc, resultType, code); } // llvm floating point class intrinsic test values @@ -4693,7 +4715,7 @@ mlir::Value IntrinsicLibrary::genIsFPClass(mlir::Type resultType, assert(args.size() == 1); mlir::Type i1Ty = builder.getI1Type(); mlir::Value isfpclass = - builder.create(loc, i1Ty, args[0], fpclass); + mlir::LLVM::IsFPClass::create(builder, loc, i1Ty, args[0], fpclass); return builder.createConvert(loc, resultType, isfpclass); } @@ -4708,7 +4730,7 @@ mlir::Value IntrinsicLibrary::genQNan(mlir::Type resultType) { void IntrinsicLibrary::genRaiseExcept(int excepts, mlir::Value cond) { fir::IfOp ifOp; if (cond) { - ifOp = builder.create(loc, 
cond, /*withElseRegion=*/false); + ifOp = fir::IfOp::create(builder, loc, cond, /*withElseRegion=*/false); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); } mlir::Type i32Ty = builder.getIntegerType(32); @@ -4729,11 +4751,11 @@ getFieldRef(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value rec, mlir::dyn_cast(fir::unwrapPassByRefType(rec.getType())); assert(index < recType.getTypeList().size() && "not enough components"); auto [fieldName, fieldTy] = recType.getTypeList()[index]; - mlir::Value field = builder.create( - loc, fir::FieldType::get(recType.getContext()), fieldName, recType, - fir::getTypeParams(rec)); - return {builder.create(loc, builder.getRefType(fieldTy), - rec, field), + mlir::Value field = fir::FieldIndexOp::create( + builder, loc, fir::FieldType::get(recType.getContext()), fieldName, + recType, fir::getTypeParams(rec)); + return {fir::CoordinateOp::create(builder, loc, builder.getRefType(fieldTy), + rec, field), fieldTy}; } @@ -4746,9 +4768,9 @@ IntrinsicLibrary::genIeeeTypeCompare(mlir::Type resultType, assert(args.size() == 2); auto [leftRef, fieldTy] = getFieldRef(builder, loc, args[0]); auto [rightRef, ignore] = getFieldRef(builder, loc, args[1]); - mlir::Value left = builder.create(loc, fieldTy, leftRef); - mlir::Value right = builder.create(loc, fieldTy, rightRef); - return builder.create(loc, pred, left, right); + mlir::Value left = fir::LoadOp::create(builder, loc, fieldTy, leftRef); + mlir::Value right = fir::LoadOp::create(builder, loc, fieldTy, rightRef); + return mlir::arith::CmpIOp::create(builder, loc, pred, left, right); } // IEEE_CLASS @@ -4779,7 +4801,7 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType, const unsigned intWidth = realType.getWidth(); mlir::Type intType = builder.getIntegerType(intWidth); mlir::Value intVal = - builder.create(loc, intType, realVal); + mlir::arith::BitcastOp::create(builder, loc, intType, realVal); llvm::StringRef tableName = RTNAME_STRING(IeeeClassTable); uint64_t highSignificandSize = (realType.getWidth() == 80) + 1; @@ -4789,8 +4811,8 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType, return builder.createIntegerConstant(loc, intType, k); }; auto createIntegerConstantAPI = [&](const llvm::APInt &apInt) { - return builder.create( - loc, intType, builder.getIntegerAttr(intType, apInt)); + return mlir::arith::ConstantOp::create( + builder, loc, intType, builder.getIntegerAttr(intType, apInt)); }; auto getMasksAndShifts = [&](uint64_t totalSize, uint64_t exponentSize, uint64_t significandSize, @@ -4837,50 +4859,52 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType, // [s] sign bit int pos = 3 + highSignificandSize; - mlir::Value index = builder.create( - loc, builder.create(loc, intVal, signShift), + mlir::Value index = mlir::arith::AndIOp::create( + builder, loc, + mlir::arith::ShRUIOp::create(builder, loc, intVal, signShift), createIntegerConstant(1ULL << pos)); // [e] exponent != 0 mlir::Value exponent = - builder.create(loc, intVal, exponentMask); + mlir::arith::AndIOp::create(builder, loc, intVal, exponentMask); mlir::Value zero = createIntegerConstant(0); - index = builder.create( - loc, index, - builder.create( - loc, - builder.create( - loc, mlir::arith::CmpIPredicate::ne, exponent, zero), + index = mlir::arith::OrIOp::create( + builder, loc, index, + mlir::arith::SelectOp::create( + builder, loc, + mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, exponent, zero), createIntegerConstant(1ULL << --pos), zero)); // 
[m] exponent == 1..1 (max exponent) - index = builder.create( - loc, index, - builder.create( - loc, - builder.create( - loc, mlir::arith::CmpIPredicate::eq, exponent, exponentMask), + index = mlir::arith::OrIOp::create( + builder, loc, index, + mlir::arith::SelectOp::create( + builder, loc, + mlir::arith::CmpIOp::create(builder, loc, + mlir::arith::CmpIPredicate::eq, exponent, + exponentMask), createIntegerConstant(1ULL << --pos), zero)); // [l] low-order significand != 0 - index = builder.create( - loc, index, - builder.create( - loc, - builder.create( - loc, mlir::arith::CmpIPredicate::ne, - builder.create(loc, intVal, - lowSignificandMask), + index = mlir::arith::OrIOp::create( + builder, loc, index, + mlir::arith::SelectOp::create( + builder, loc, + mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, + mlir::arith::AndIOp::create(builder, loc, intVal, + lowSignificandMask), zero), createIntegerConstant(1ULL << --pos), zero)); // [h] high-order significand (1 or 2 bits) - index = builder.create( - loc, index, - builder.create( - loc, - builder.create(loc, intVal, - highSignificandShift), + index = mlir::arith::OrIOp::create( + builder, loc, index, + mlir::arith::AndIOp::create( + builder, loc, + mlir::arith::ShRUIOp::create(builder, loc, intVal, + highSignificandShift), createIntegerConstant((1 << highSignificandSize) - 1))); int tableSize = 1 << (4 + highSignificandSize); @@ -5008,10 +5032,10 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType, mlir::RankedTensorType::get(tableSize, int8Ty), values)); } - return builder.create( - loc, builder.getRefType(resultType), - builder.create(loc, builder.getRefType(tableTy), - builder.getSymbolRefAttr(tableName)), + return fir::CoordinateOp::create( + builder, loc, builder.getRefType(resultType), + fir::AddrOfOp::create(builder, loc, builder.getRefType(tableTy), + builder.getSymbolRefAttr(tableName)), index); } @@ -5038,34 +5062,36 @@ IntrinsicLibrary::genIeeeCopySign(mlir::Type resultType, // Args have the same type. if (xRealType == yRealType) - return builder.create(loc, xRealVal, yRealVal); + return mlir::math::CopySignOp::create(builder, loc, xRealVal, yRealVal); // Args have different types. 
mlir::Type xIntType = builder.getIntegerType(xRealType.getWidth()); mlir::Type yIntType = builder.getIntegerType(yRealType.getWidth()); mlir::Value xIntVal = - builder.create(loc, xIntType, xRealVal); + mlir::arith::BitcastOp::create(builder, loc, xIntType, xRealVal); mlir::Value yIntVal = - builder.create(loc, yIntType, yRealVal); + mlir::arith::BitcastOp::create(builder, loc, yIntType, yRealVal); mlir::Value xZero = builder.createIntegerConstant(loc, xIntType, 0); mlir::Value yZero = builder.createIntegerConstant(loc, yIntType, 0); mlir::Value xOne = builder.createIntegerConstant(loc, xIntType, 1); - mlir::Value ySign = builder.create( - loc, yIntVal, + mlir::Value ySign = mlir::arith::ShRUIOp::create( + builder, loc, yIntVal, builder.createIntegerConstant(loc, yIntType, yRealType.getWidth() - 1)); - mlir::Value xAbs = builder.create( - loc, builder.create(loc, xIntVal, xOne), xOne); - mlir::Value xSign = builder.create( - loc, - builder.create(loc, mlir::arith::CmpIPredicate::eq, - ySign, yZero), + mlir::Value xAbs = mlir::arith::ShRUIOp::create( + builder, loc, mlir::arith::ShLIOp::create(builder, loc, xIntVal, xOne), + xOne); + mlir::Value xSign = mlir::arith::SelectOp::create( + builder, loc, + mlir::arith::CmpIOp::create(builder, loc, mlir::arith::CmpIPredicate::eq, + ySign, yZero), xZero, - builder.create( - loc, xOne, + mlir::arith::ShLIOp::create( + builder, loc, xOne, builder.createIntegerConstant(loc, xIntType, xRealType.getWidth() - 1))); - return builder.create( - loc, xRealType, builder.create(loc, xAbs, xSign)); + return mlir::arith::BitcastOp::create( + builder, loc, xRealType, + mlir::arith::OrIOp::create(builder, loc, xAbs, xSign)); } // IEEE_GET_FLAG @@ -5079,16 +5105,16 @@ void IntrinsicLibrary::genIeeeGetFlag(llvm::ArrayRef args) { mlir::Type i32Ty = builder.getIntegerType(32); mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0); auto [fieldRef, ignore] = getFieldRef(builder, loc, flag); - mlir::Value field = builder.create(loc, fieldRef); + mlir::Value field = fir::LoadOp::create(builder, loc, fieldRef); mlir::Value excepts = fir::runtime::genFetestexcept( builder, loc, fir::runtime::genMapExcept( - builder, loc, builder.create(loc, i32Ty, field))); - mlir::Value logicalResult = builder.create( - loc, resultTy, - builder.create(loc, mlir::arith::CmpIPredicate::ne, - excepts, zero)); - builder.create(loc, logicalResult, flagValue); + builder, loc, fir::ConvertOp::create(builder, loc, i32Ty, field))); + mlir::Value logicalResult = fir::ConvertOp::create( + builder, loc, resultTy, + mlir::arith::CmpIOp::create(builder, loc, mlir::arith::CmpIPredicate::ne, + excepts, zero)); + fir::StoreOp::create(builder, loc, logicalResult, flagValue); } // IEEE_GET_HALTING_MODE @@ -5103,17 +5129,17 @@ void IntrinsicLibrary::genIeeeGetHaltingMode( mlir::Type i32Ty = builder.getIntegerType(32); mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0); auto [fieldRef, ignore] = getFieldRef(builder, loc, flag); - mlir::Value field = builder.create(loc, fieldRef); + mlir::Value field = fir::LoadOp::create(builder, loc, fieldRef); mlir::Value haltSet = fir::runtime::genFegetexcept(builder, loc); - mlir::Value intResult = builder.create( - loc, haltSet, + mlir::Value intResult = mlir::arith::AndIOp::create( + builder, loc, haltSet, fir::runtime::genMapExcept( - builder, loc, builder.create(loc, i32Ty, field))); - mlir::Value logicalResult = builder.create( - loc, resultTy, - builder.create(loc, mlir::arith::CmpIPredicate::ne, - intResult, zero)); - builder.create(loc, 
logicalResult, halting); + builder, loc, fir::ConvertOp::create(builder, loc, i32Ty, field))); + mlir::Value logicalResult = fir::ConvertOp::create( + builder, loc, resultTy, + mlir::arith::CmpIOp::create(builder, loc, mlir::arith::CmpIPredicate::ne, + intResult, zero)); + fir::StoreOp::create(builder, loc, logicalResult, halting); } // IEEE_GET_MODES, IEEE_SET_MODES @@ -5138,34 +5164,34 @@ void IntrinsicLibrary::genIeeeGetOrSetModesOrStatus( // allotment. Allocate data space from the heap. auto [fieldRef, fieldTy] = getFieldRef(builder, loc, fir::getBase(args[0]), 1); - addr = builder.create( - loc, builder.create(loc, fieldRef)); + addr = fir::BoxAddrOp::create(builder, loc, + fir::LoadOp::create(builder, loc, fieldRef)); mlir::Type heapTy = addr.getType(); - mlir::Value allocated = builder.create( - loc, mlir::arith::CmpIPredicate::ne, + mlir::Value allocated = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, builder.createConvert(loc, i64Ty, addr), builder.createIntegerConstant(loc, i64Ty, 0)); - auto ifOp = builder.create(loc, heapTy, allocated, - /*withElseRegion=*/true); + auto ifOp = fir::IfOp::create(builder, loc, heapTy, allocated, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - builder.create(loc, addr); + fir::ResultOp::create(builder, loc, addr); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); mlir::Value byteSize = isModes ? fir::runtime::genGetModesTypeSize(builder, loc) : fir::runtime::genGetStatusTypeSize(builder, loc); byteSize = builder.createConvert(loc, builder.getIndexType(), byteSize); - addr = builder.create(loc, extractSequenceType(heapTy), - /*typeparams=*/mlir::ValueRange(), - byteSize); - mlir::Value shape = builder.create(loc, byteSize); - builder.create( - loc, builder.create(loc, fieldTy, addr, shape), fieldRef); - builder.create(loc, addr); + addr = fir::AllocMemOp::create(builder, loc, extractSequenceType(heapTy), + /*typeparams=*/mlir::ValueRange(), byteSize); + mlir::Value shape = fir::ShapeOp::create(builder, loc, byteSize); + fir::StoreOp::create( + builder, loc, fir::EmboxOp::create(builder, loc, fieldTy, addr, shape), + fieldRef); + fir::ResultOp::create(builder, loc, addr); builder.setInsertionPointAfter(ifOp); - addr = builder.create(loc, ptrTy, ifOp.getResult(0)); + addr = fir::ConvertOp::create(builder, loc, ptrTy, ifOp.getResult(0)); } else { // Place floating point environment data in __data storage. - addr = builder.create(loc, ptrTy, getBase(args[0])); + addr = fir::ConvertOp::create(builder, loc, ptrTy, getBase(args[0])); } llvm::StringRef func = isModes ? (isGet ? "fegetmode" : "fesetmode") : (isGet ? "fegetenv" : "fesetenv"); @@ -5176,11 +5202,11 @@ void IntrinsicLibrary::genIeeeGetOrSetModesOrStatus( // Check that an explicit ieee_[get|set]_rounding_mode call radix value is 2. 
static void checkRadix(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value radix, std::string procName) { - mlir::Value notTwo = builder.create( - loc, mlir::arith::CmpIPredicate::ne, radix, + mlir::Value notTwo = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, radix, builder.createIntegerConstant(loc, radix.getType(), 2)); - auto ifOp = builder.create(loc, notTwo, - /*withElseRegion=*/false); + auto ifOp = fir::IfOp::create(builder, loc, notTwo, + /*withElseRegion=*/false); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); fir::runtime::genReportFatalUserError(builder, loc, procName + " radix argument must be 2"); @@ -5198,9 +5224,9 @@ void IntrinsicLibrary::genIeeeGetRoundingMode( checkRadix(builder, loc, fir::getBase(args[1]), "ieee_get_rounding_mode"); auto [fieldRef, fieldTy] = getFieldRef(builder, loc, fir::getBase(args[0])); mlir::func::FuncOp getRound = fir::factory::getLlvmGetRounding(builder); - mlir::Value mode = builder.create(loc, getRound).getResult(0); + mlir::Value mode = fir::CallOp::create(builder, loc, getRound).getResult(0); mode = builder.createConvert(loc, fieldTy, mode); - builder.create(loc, mode, fieldRef); + fir::StoreOp::create(builder, loc, mode, fieldRef); } // IEEE_GET_UNDERFLOW_MODE @@ -5225,44 +5251,45 @@ mlir::Value IntrinsicLibrary::genIeeeInt(mlir::Type resultType, mlir::FloatType realType = mlir::cast(args[0].getType()); mlir::Value realResult = genIeeeRint(realType, {args[0], args[1]}); int intWidth = mlir::cast(resultType).getWidth(); - mlir::Value intLBound = builder.create( - loc, resultType, + mlir::Value intLBound = mlir::arith::ConstantOp::create( + builder, loc, resultType, builder.getIntegerAttr(resultType, llvm::APInt::getBitsSet(intWidth, /*lo=*/intWidth - 1, /*hi=*/intWidth))); - mlir::Value intUBound = builder.create( - loc, resultType, + mlir::Value intUBound = mlir::arith::ConstantOp::create( + builder, loc, resultType, builder.getIntegerAttr(resultType, llvm::APInt::getBitsSet(intWidth, /*lo=*/0, /*hi=*/intWidth - 1))); mlir::Value realLBound = - builder.create(loc, realType, intLBound); - mlir::Value realUBound = builder.create(loc, realLBound); - mlir::Value aGreaterThanLBound = builder.create( - loc, mlir::arith::CmpFPredicate::OGE, realResult, realLBound); - mlir::Value aLessThanUBound = builder.create( - loc, mlir::arith::CmpFPredicate::OLT, realResult, realUBound); - mlir::Value resultIsValid = builder.create( - loc, aGreaterThanLBound, aLessThanUBound); + fir::ConvertOp::create(builder, loc, realType, intLBound); + mlir::Value realUBound = + mlir::arith::NegFOp::create(builder, loc, realLBound); + mlir::Value aGreaterThanLBound = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OGE, realResult, realLBound); + mlir::Value aLessThanUBound = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OLT, realResult, realUBound); + mlir::Value resultIsValid = mlir::arith::AndIOp::create( + builder, loc, aGreaterThanLBound, aLessThanUBound); // Result is valid. It may be exact or inexact. 
mlir::Value result; - fir::IfOp ifOp = builder.create(loc, resultType, resultIsValid, - /*withElseRegion=*/true); + fir::IfOp ifOp = fir::IfOp::create(builder, loc, resultType, resultIsValid, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - mlir::Value inexact = builder.create( - loc, mlir::arith::CmpFPredicate::ONE, args[0], realResult); + mlir::Value inexact = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::ONE, args[0], realResult); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INEXACT, inexact); - result = builder.create(loc, resultType, realResult); - builder.create(loc, result); + result = fir::ConvertOp::create(builder, loc, resultType, realResult); + fir::ResultOp::create(builder, loc, result); // Result is invalid. builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INVALID); - result = builder.create(loc, aGreaterThanLBound, - intUBound, intLBound); - builder.create(loc, result); + result = mlir::arith::SelectOp::create(builder, loc, aGreaterThanLBound, + intUBound, intLBound); + fir::ResultOp::create(builder, loc, result); builder.setInsertionPointAfter(ifOp); return ifOp.getResult(0); } @@ -5317,7 +5344,7 @@ mlir::Value IntrinsicLibrary::genIeeeLogb(mlir::Type resultType, int bitWidth = realType.getWidth(); mlir::Type intType = builder.getIntegerType(realType.getWidth()); mlir::Value intVal = - builder.create(loc, intType, realVal); + mlir::arith::BitcastOp::create(builder, loc, intType, realVal); mlir::Type i1Ty = builder.getI1Type(); int exponentBias, significandSize, nonSignificandSize; @@ -5364,72 +5391,72 @@ mlir::Value IntrinsicLibrary::genIeeeLogb(mlir::Type resultType, llvm_unreachable("unknown real type"); } - mlir::Value isZero = builder.create( - loc, mlir::arith::CmpFPredicate::OEQ, realVal, + mlir::Value isZero = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OEQ, realVal, builder.createRealZeroConstant(loc, resultType)); - auto outerIfOp = builder.create(loc, resultType, isZero, - /*withElseRegion=*/true); + auto outerIfOp = fir::IfOp::create(builder, loc, resultType, isZero, + /*withElseRegion=*/true); // X is zero -- result is -infinity builder.setInsertionPointToStart(&outerIfOp.getThenRegion().front()); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO); mlir::Value ones = builder.createAllOnesInteger(loc, intType); - mlir::Value result = builder.create( - loc, ones, + mlir::Value result = mlir::arith::ShLIOp::create( + builder, loc, ones, builder.createIntegerConstant(loc, intType, // kind=10 high-order bit is explicit significandSize - (bitWidth == 80))); - result = builder.create(loc, resultType, result); - builder.create(loc, result); + result = mlir::arith::BitcastOp::create(builder, loc, resultType, result); + fir::ResultOp::create(builder, loc, result); builder.setInsertionPointToStart(&outerIfOp.getElseRegion().front()); mlir::Value one = builder.createIntegerConstant(loc, intType, 1); mlir::Value shiftLeftOne = - builder.create(loc, intVal, one); + mlir::arith::ShLIOp::create(builder, loc, intVal, one); mlir::Value isFinite = genIsFPClass(i1Ty, args, finiteTest); - auto innerIfOp = builder.create(loc, resultType, isFinite, - /*withElseRegion=*/true); + auto innerIfOp = fir::IfOp::create(builder, loc, resultType, isFinite, + /*withElseRegion=*/true); // X is non-zero finite -- result is unbiased exponent of X builder.setInsertionPointToStart(&innerIfOp.getThenRegion().front()); mlir::Value isNormal = 
genIsFPClass(i1Ty, args, normalTest); - auto normalIfOp = builder.create(loc, resultType, isNormal, - /*withElseRegion=*/true); + auto normalIfOp = fir::IfOp::create(builder, loc, resultType, isNormal, + /*withElseRegion=*/true); // X is normal builder.setInsertionPointToStart(&normalIfOp.getThenRegion().front()); - mlir::Value biasedExponent = builder.create( - loc, shiftLeftOne, + mlir::Value biasedExponent = mlir::arith::ShRUIOp::create( + builder, loc, shiftLeftOne, builder.createIntegerConstant(loc, intType, significandSize + 1)); - result = builder.create( - loc, biasedExponent, + result = mlir::arith::SubIOp::create( + builder, loc, biasedExponent, builder.createIntegerConstant(loc, intType, exponentBias)); - result = builder.create(loc, resultType, result); - builder.create(loc, result); + result = fir::ConvertOp::create(builder, loc, resultType, result); + fir::ResultOp::create(builder, loc, result); // X is denormal -- result is (-exponentBias - ctlz(significand)) builder.setInsertionPointToStart(&normalIfOp.getElseRegion().front()); - mlir::Value significand = builder.create( - loc, intVal, + mlir::Value significand = mlir::arith::ShLIOp::create( + builder, loc, intVal, builder.createIntegerConstant(loc, intType, nonSignificandSize)); mlir::Value ctlz = - builder.create(loc, significand); + mlir::math::CountLeadingZerosOp::create(builder, loc, significand); mlir::Type i32Ty = builder.getI32Type(); - result = builder.create( - loc, builder.createIntegerConstant(loc, i32Ty, -exponentBias), - builder.create(loc, i32Ty, ctlz)); - result = builder.create(loc, resultType, result); - builder.create(loc, result); + result = mlir::arith::SubIOp::create( + builder, loc, builder.createIntegerConstant(loc, i32Ty, -exponentBias), + fir::ConvertOp::create(builder, loc, i32Ty, ctlz)); + result = fir::ConvertOp::create(builder, loc, resultType, result); + fir::ResultOp::create(builder, loc, result); builder.setInsertionPointToEnd(&innerIfOp.getThenRegion().front()); - builder.create(loc, normalIfOp.getResult(0)); + fir::ResultOp::create(builder, loc, normalIfOp.getResult(0)); // X is infinity or NaN -- result is +infinity or NaN builder.setInsertionPointToStart(&innerIfOp.getElseRegion().front()); - result = builder.create(loc, shiftLeftOne, one); - result = builder.create(loc, resultType, result); - builder.create(loc, result); + result = mlir::arith::ShRUIOp::create(builder, loc, shiftLeftOne, one); + result = mlir::arith::BitcastOp::create(builder, loc, resultType, result); + fir::ResultOp::create(builder, loc, result); // Unwind the if nest. builder.setInsertionPointToEnd(&outerIfOp.getElseRegion().front()); - builder.create(loc, innerIfOp.getResult(0)); + fir::ResultOp::create(builder, loc, innerIfOp.getResult(0)); builder.setInsertionPointAfter(outerIfOp); return outerIfOp.getResult(0); } @@ -5463,8 +5490,8 @@ mlir::Value IntrinsicLibrary::genIeeeMaxMin(mlir::Type resultType, mlir::Value x1, y1; // X or ABS(X), Y or ABS(Y) if constexpr (isMag) { mlir::Value zero = builder.createRealZeroConstant(loc, resultType); - x1 = builder.create(loc, x, zero); - y1 = builder.create(loc, y, zero); + x1 = mlir::math::CopySignOp::create(builder, loc, x, zero); + y1 = mlir::math::CopySignOp::create(builder, loc, y, zero); } else { x1 = x; y1 = y; @@ -5475,56 +5502,56 @@ mlir::Value IntrinsicLibrary::genIeeeMaxMin(mlir::Type resultType, // X1 < Y1 -- MAX result is Y; MIN result is X. 
pred = mlir::arith::CmpFPredicate::OLT; - cmp = builder.create(loc, pred, x1, y1); - auto ifOp1 = builder.create(loc, resultType, cmp, true); + cmp = mlir::arith::CmpFOp::create(builder, loc, pred, x1, y1); + auto ifOp1 = fir::IfOp::create(builder, loc, resultType, cmp, true); builder.setInsertionPointToStart(&ifOp1.getThenRegion().front()); result = isMax ? y : x; - builder.create(loc, result); + fir::ResultOp::create(builder, loc, result); // X1 > Y1 -- MAX result is X; MIN result is Y. builder.setInsertionPointToStart(&ifOp1.getElseRegion().front()); pred = mlir::arith::CmpFPredicate::OGT; - cmp = builder.create(loc, pred, x1, y1); - auto ifOp2 = builder.create(loc, resultType, cmp, true); + cmp = mlir::arith::CmpFOp::create(builder, loc, pred, x1, y1); + auto ifOp2 = fir::IfOp::create(builder, loc, resultType, cmp, true); builder.setInsertionPointToStart(&ifOp2.getThenRegion().front()); result = isMax ? x : y; - builder.create(loc, result); + fir::ResultOp::create(builder, loc, result); // X1 == Y1 -- MAX favors a positive result; MIN favors a negative result. builder.setInsertionPointToStart(&ifOp2.getElseRegion().front()); pred = mlir::arith::CmpFPredicate::OEQ; - cmp = builder.create(loc, pred, x1, y1); - auto ifOp3 = builder.create(loc, resultType, cmp, true); + cmp = mlir::arith::CmpFOp::create(builder, loc, pred, x1, y1); + auto ifOp3 = fir::IfOp::create(builder, loc, resultType, cmp, true); builder.setInsertionPointToStart(&ifOp3.getThenRegion().front()); resultIsX = isMax ? genIsFPClass(i1Ty, x, positiveTest) : genIsFPClass(i1Ty, x, negativeTest); - result = builder.create(loc, resultIsX, x, y); - builder.create(loc, result); + result = mlir::arith::SelectOp::create(builder, loc, resultIsX, x, y); + fir::ResultOp::create(builder, loc, result); // X or Y or both are NaNs -- result may be X, Y, or a qNaN builder.setInsertionPointToStart(&ifOp3.getElseRegion().front()); if constexpr (isNum) { pred = mlir::arith::CmpFPredicate::ORD; // check for a non-NaN - resultIsX = builder.create(loc, pred, x, x); - resultIsY = builder.create(loc, pred, y, y); + resultIsX = mlir::arith::CmpFOp::create(builder, loc, pred, x, x); + resultIsY = mlir::arith::CmpFOp::create(builder, loc, pred, y, y); } else { resultIsX = resultIsY = builder.createBool(loc, false); } - result = builder.create( - loc, resultIsX, x, - builder.create(loc, resultIsY, y, - genQNan(resultType))); - mlir::Value hasSNaNOp = builder.create( - loc, genIsFPClass(builder.getI1Type(), args[0], snanTest), + result = mlir::arith::SelectOp::create( + builder, loc, resultIsX, x, + mlir::arith::SelectOp::create(builder, loc, resultIsY, y, + genQNan(resultType))); + mlir::Value hasSNaNOp = mlir::arith::OrIOp::create( + builder, loc, genIsFPClass(builder.getI1Type(), args[0], snanTest), genIsFPClass(builder.getI1Type(), args[1], snanTest)); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INVALID, hasSNaNOp); - builder.create(loc, result); + fir::ResultOp::create(builder, loc, result); // Unwind the if nest. builder.setInsertionPointAfter(ifOp3); - builder.create(loc, ifOp3.getResult(0)); + fir::ResultOp::create(builder, loc, ifOp3.getResult(0)); builder.setInsertionPointAfter(ifOp2); - builder.create(loc, ifOp2.getResult(0)); + fir::ResultOp::create(builder, loc, ifOp2.getResult(0)); builder.setInsertionPointAfter(ifOp1); return ifOp1.getResult(0); } @@ -5537,13 +5564,13 @@ IntrinsicLibrary::genIeeeQuietCompare(mlir::Type resultType, llvm::ArrayRef args) { // Compare X and Y with special case treatment of NaN operands. 
assert(args.size() == 2); - mlir::Value hasSNaNOp = builder.create( - loc, genIsFPClass(builder.getI1Type(), args[0], snanTest), + mlir::Value hasSNaNOp = mlir::arith::OrIOp::create( + builder, loc, genIsFPClass(builder.getI1Type(), args[0], snanTest), genIsFPClass(builder.getI1Type(), args[1], snanTest)); mlir::Value res = - builder.create(loc, pred, args[0], args[1]); + mlir::arith::CmpFOp::create(builder, loc, pred, args[0], args[1]); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INVALID, hasSNaNOp); - return builder.create(loc, resultType, res); + return fir::ConvertOp::create(builder, loc, resultType, res); } // IEEE_REAL @@ -5595,14 +5622,14 @@ mlir::Value IntrinsicLibrary::genIeeeReal(mlir::Type resultType, // If the argument is an sNaN, raise an invalid exception and return a qNaN. // Otherwise return the argument. auto processSnan = [&](mlir::Value x) { - fir::IfOp ifOp = builder.create(loc, resultType, - genIsFPClass(i1Ty, x, snanTest), - /*withElseRegion=*/true); + fir::IfOp ifOp = fir::IfOp::create(builder, loc, resultType, + genIsFPClass(i1Ty, x, snanTest), + /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INVALID); - builder.create(loc, genQNan(resultType)); + fir::ResultOp::create(builder, loc, genQNan(resultType)); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - builder.create(loc, x); + fir::ResultOp::create(builder, loc, x); builder.setInsertionPointAfter(ifOp); return ifOp.getResult(0); }; @@ -5618,7 +5645,7 @@ mlir::Value IntrinsicLibrary::genIeeeReal(mlir::Type resultType, a = builder.createConvert(loc, f32Ty, a); aType = f32Ty; } - r = builder.create(loc, resultType, a); + r = fir::ConvertOp::create(builder, loc, resultType, a); mlir::IntegerType aIntType = mlir::dyn_cast(aType); mlir::FloatType aFloatType = mlir::dyn_cast(aType); @@ -5630,142 +5657,144 @@ mlir::Value IntrinsicLibrary::genIeeeReal(mlir::Type resultType, return aIntType ? r : processSnan(r); // A possibly inexact conversion result may need to be rounded up or down. - mlir::Value b = builder.create(loc, aType, r); + mlir::Value b = fir::ConvertOp::create(builder, loc, aType, r); mlir::Value aEqB; if (aIntType) - aEqB = builder.create( - loc, mlir::arith::CmpIPredicate::eq, a, b); + aEqB = mlir::arith::CmpIOp::create(builder, loc, + mlir::arith::CmpIPredicate::eq, a, b); else - aEqB = builder.create( - loc, mlir::arith::CmpFPredicate::UEQ, a, b); + aEqB = mlir::arith::CmpFOp::create(builder, loc, + mlir::arith::CmpFPredicate::UEQ, a, b); // [a == b] a is a NaN or r is exact (a may be -0, +0, -inf, +inf) -- return r - fir::IfOp ifOp1 = builder.create(loc, resultType, aEqB, - /*withElseRegion=*/true); + fir::IfOp ifOp1 = fir::IfOp::create(builder, loc, resultType, aEqB, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp1.getThenRegion().front()); - builder.create(loc, aIntType ? r : processSnan(r)); + fir::ResultOp::create(builder, loc, aIntType ? r : processSnan(r)); // Code common to (a < b) and (a > b) branches. 
builder.setInsertionPointToStart(&ifOp1.getElseRegion().front()); mlir::func::FuncOp getRound = fir::factory::getLlvmGetRounding(builder); - mlir::Value mode = builder.create(loc, getRound).getResult(0); + mlir::Value mode = fir::CallOp::create(builder, loc, getRound).getResult(0); mlir::Value aIsNegative, aIsPositive; if (aIntType) { mlir::Value zero = builder.createIntegerConstant(loc, aIntType, 0); - aIsNegative = builder.create( - loc, mlir::arith::CmpIPredicate::slt, a, zero); - aIsPositive = builder.create( - loc, mlir::arith::CmpIPredicate::sgt, a, zero); + aIsNegative = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, a, zero); + aIsPositive = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sgt, a, zero); } else { mlir::Value zero = builder.createRealZeroConstant(loc, aFloatType); - aIsNegative = builder.create( - loc, mlir::arith::CmpFPredicate::OLT, a, zero); - aIsPositive = builder.create( - loc, mlir::arith::CmpFPredicate::OGT, a, zero); + aIsNegative = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OLT, a, zero); + aIsPositive = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OGT, a, zero); } mlir::Type resultIntType = builder.getIntegerType(resultFloatType.getWidth()); mlir::Value resultCast = - builder.create(loc, resultIntType, r); + mlir::arith::BitcastOp::create(builder, loc, resultIntType, r); mlir::Value one = builder.createIntegerConstant(loc, resultIntType, 1); - mlir::Value rIsOdd = builder.create( - loc, i1Ty, builder.create(loc, resultCast, one)); + mlir::Value rIsOdd = fir::ConvertOp::create( + builder, loc, i1Ty, + mlir::arith::AndIOp::create(builder, loc, resultCast, one)); // Check for a rounding mode match. auto match = [&](int m) { - return builder.create( - loc, mlir::arith::CmpIPredicate::eq, mode, + return mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, mode, builder.createIntegerConstant(loc, mode.getType(), m)); }; - mlir::Value roundToNearestBit = builder.create( - loc, + mlir::Value roundToNearestBit = mlir::arith::OrIOp::create( + builder, loc, // IEEE_OTHER is an alias for IEEE_NEAREST. 
match(_FORTRAN_RUNTIME_IEEE_NEAREST), match(_FORTRAN_RUNTIME_IEEE_OTHER)); mlir::Value roundToNearest = - builder.create(loc, roundToNearestBit, rIsOdd); + mlir::arith::AndIOp::create(builder, loc, roundToNearestBit, rIsOdd); mlir::Value roundToZeroBit = match(_FORTRAN_RUNTIME_IEEE_TO_ZERO); mlir::Value roundAwayBit = match(_FORTRAN_RUNTIME_IEEE_AWAY); mlir::Value roundToZero, roundAway, mustAdjust; fir::IfOp adjustIfOp; mlir::Value aLtB; if (aIntType) - aLtB = builder.create( - loc, mlir::arith::CmpIPredicate::slt, a, b); + aLtB = mlir::arith::CmpIOp::create(builder, loc, + mlir::arith::CmpIPredicate::slt, a, b); else - aLtB = builder.create( - loc, mlir::arith::CmpFPredicate::OLT, a, b); + aLtB = mlir::arith::CmpFOp::create(builder, loc, + mlir::arith::CmpFPredicate::OLT, a, b); mlir::Value upResult = - builder.create(loc, resultCast, one); + mlir::arith::AddIOp::create(builder, loc, resultCast, one); mlir::Value downResult = - builder.create(loc, resultCast, one); + mlir::arith::SubIOp::create(builder, loc, resultCast, one); // (a < b): r is inexact -- return r or ieee_next_down(r) - fir::IfOp ifOp2 = builder.create(loc, resultType, aLtB, - /*withElseRegion=*/true); + fir::IfOp ifOp2 = fir::IfOp::create(builder, loc, resultType, aLtB, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp2.getThenRegion().front()); roundToZero = - builder.create(loc, roundToZeroBit, aIsPositive); + mlir::arith::AndIOp::create(builder, loc, roundToZeroBit, aIsPositive); roundAway = - builder.create(loc, roundAwayBit, aIsNegative); + mlir::arith::AndIOp::create(builder, loc, roundAwayBit, aIsNegative); mlir::Value roundDown = match(_FORTRAN_RUNTIME_IEEE_DOWN); mustAdjust = - builder.create(loc, roundToNearest, roundToZero); - mustAdjust = builder.create(loc, mustAdjust, roundAway); - mustAdjust = builder.create(loc, mustAdjust, roundDown); - adjustIfOp = builder.create(loc, resultType, mustAdjust, - /*withElseRegion=*/true); + mlir::arith::OrIOp::create(builder, loc, roundToNearest, roundToZero); + mustAdjust = mlir::arith::OrIOp::create(builder, loc, mustAdjust, roundAway); + mustAdjust = mlir::arith::OrIOp::create(builder, loc, mustAdjust, roundDown); + adjustIfOp = fir::IfOp::create(builder, loc, resultType, mustAdjust, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&adjustIfOp.getThenRegion().front()); if (resultType.isF80()) r1 = fir::runtime::genNearest(builder, loc, r, builder.createBool(loc, false)); else - r1 = builder.create( - loc, resultType, - builder.create(loc, aIsNegative, upResult, - downResult)); - builder.create(loc, r1); + r1 = mlir::arith::BitcastOp::create( + builder, loc, resultType, + mlir::arith::SelectOp::create(builder, loc, aIsNegative, upResult, + downResult)); + fir::ResultOp::create(builder, loc, r1); builder.setInsertionPointToStart(&adjustIfOp.getElseRegion().front()); - builder.create(loc, r); + fir::ResultOp::create(builder, loc, r); builder.setInsertionPointAfter(adjustIfOp); - builder.create(loc, adjustIfOp.getResult(0)); + fir::ResultOp::create(builder, loc, adjustIfOp.getResult(0)); // (a > b): r is inexact -- return r or ieee_next_up(r) builder.setInsertionPointToStart(&ifOp2.getElseRegion().front()); roundToZero = - builder.create(loc, roundToZeroBit, aIsNegative); + mlir::arith::AndIOp::create(builder, loc, roundToZeroBit, aIsNegative); roundAway = - builder.create(loc, roundAwayBit, aIsPositive); + mlir::arith::AndIOp::create(builder, loc, roundAwayBit, aIsPositive); mlir::Value roundUp = match(_FORTRAN_RUNTIME_IEEE_UP); mustAdjust = - 
builder.create(loc, roundToNearest, roundToZero); - mustAdjust = builder.create(loc, mustAdjust, roundAway); - mustAdjust = builder.create(loc, mustAdjust, roundUp); - adjustIfOp = builder.create(loc, resultType, mustAdjust, - /*withElseRegion=*/true); + mlir::arith::OrIOp::create(builder, loc, roundToNearest, roundToZero); + mustAdjust = mlir::arith::OrIOp::create(builder, loc, mustAdjust, roundAway); + mustAdjust = mlir::arith::OrIOp::create(builder, loc, mustAdjust, roundUp); + adjustIfOp = fir::IfOp::create(builder, loc, resultType, mustAdjust, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&adjustIfOp.getThenRegion().front()); if (resultType.isF80()) r1 = fir::runtime::genNearest(builder, loc, r, builder.createBool(loc, true)); else - r1 = builder.create( - loc, resultType, - builder.create(loc, aIsPositive, upResult, - downResult)); - builder.create(loc, r1); + r1 = mlir::arith::BitcastOp::create( + builder, loc, resultType, + mlir::arith::SelectOp::create(builder, loc, aIsPositive, upResult, + downResult)); + fir::ResultOp::create(builder, loc, r1); builder.setInsertionPointToStart(&adjustIfOp.getElseRegion().front()); - builder.create(loc, r); + fir::ResultOp::create(builder, loc, r); builder.setInsertionPointAfter(adjustIfOp); - builder.create(loc, adjustIfOp.getResult(0)); + fir::ResultOp::create(builder, loc, adjustIfOp.getResult(0)); // Generate exceptions for (a < b) and (a > b) branches. builder.setInsertionPointAfter(ifOp2); r = ifOp2.getResult(0); - fir::IfOp exceptIfOp1 = builder.create( - loc, genIsFPClass(i1Ty, r, infiniteTest), /*withElseRegion=*/true); + fir::IfOp exceptIfOp1 = + fir::IfOp::create(builder, loc, genIsFPClass(i1Ty, r, infiniteTest), + /*withElseRegion=*/true); builder.setInsertionPointToStart(&exceptIfOp1.getThenRegion().front()); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_OVERFLOW | _FORTRAN_RUNTIME_IEEE_INEXACT); builder.setInsertionPointToStart(&exceptIfOp1.getElseRegion().front()); - fir::IfOp exceptIfOp2 = builder.create( - loc, genIsFPClass(i1Ty, r, subnormalTest | zeroTest), + fir::IfOp exceptIfOp2 = fir::IfOp::create( + builder, loc, genIsFPClass(i1Ty, r, subnormalTest | zeroTest), /*withElseRegion=*/true); builder.setInsertionPointToStart(&exceptIfOp2.getThenRegion().front()); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_UNDERFLOW | @@ -5773,7 +5802,7 @@ mlir::Value IntrinsicLibrary::genIeeeReal(mlir::Type resultType, builder.setInsertionPointToStart(&exceptIfOp2.getElseRegion().front()); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INEXACT); builder.setInsertionPointAfter(exceptIfOp1); - builder.create(loc, ifOp2.getResult(0)); + fir::ResultOp::create(builder, loc, ifOp2.getResult(0)); builder.setInsertionPointAfter(ifOp1); return ifOp1.getResult(0); } @@ -5789,19 +5818,19 @@ mlir::Value IntrinsicLibrary::genIeeeRem(mlir::Type resultType, mlir::Value y = args[1]; if (mlir::dyn_cast(resultType).getWidth() < 32) { mlir::Type f32Ty = mlir::Float32Type::get(builder.getContext()); - x = builder.create(loc, f32Ty, x); - y = builder.create(loc, f32Ty, y); + x = fir::ConvertOp::create(builder, loc, f32Ty, x); + y = fir::ConvertOp::create(builder, loc, f32Ty, y); } else { - x = builder.create(loc, resultType, x); - y = builder.create(loc, resultType, y); + x = fir::ConvertOp::create(builder, loc, resultType, x); + y = fir::ConvertOp::create(builder, loc, resultType, y); } // remainder calls do not signal IEEE_UNDERFLOW. 
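The comment above notes that the underlying `remainder` call never signals IEEE_UNDERFLOW, so the hunk that follows computes the one case that needs the flag by hand (subnormal X with infinite Y, where the exact result is the subnormal X itself). A small standalone illustration of that libm behaviour, assuming a toolchain with IEEE-754 doubles and `<cfenv>` flags (not part of the patch):

```cpp
#include <cfenv>
#include <cmath>
#include <cstdio>

// remainder(x, inf) == x exactly for finite x, so libm typically raises no
// underflow even when x is subnormal; the lowering raises IEEE_UNDERFLOW
// for that case explicitly.
int main() {
  std::feclearexcept(FE_ALL_EXCEPT);
  double subnormal = 1e-310; // below DBL_MIN, so subnormal
  double r = std::remainder(subnormal, INFINITY);
  std::printf("result == x: %d\n", r == subnormal);              // 1
  std::printf("libm raised underflow: %d\n",
              std::fetestexcept(FE_UNDERFLOW) != 0);             // typically 0
}
```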
- mlir::Value underflow = builder.create( - loc, genIsFPClass(builder.getI1Type(), x, subnormalTest), + mlir::Value underflow = mlir::arith::AndIOp::create( + builder, loc, genIsFPClass(builder.getI1Type(), x, subnormalTest), genIsFPClass(builder.getI1Type(), y, infiniteTest)); mlir::Value result = genRuntimeCall("remainder", x.getType(), {x, y}); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_UNDERFLOW, underflow); - return builder.create(loc, resultType, result); + return fir::ConvertOp::create(builder, loc, resultType, result); } // IEEE_RINT @@ -5817,19 +5846,19 @@ mlir::Value IntrinsicLibrary::genIeeeRint(mlir::Type resultType, mlir::func::FuncOp setRound = fir::factory::getLlvmSetRounding(builder); mlir::Value mode; if (isStaticallyPresent(args[1])) { - mode = builder.create(loc, getRound).getResult(0); + mode = fir::CallOp::create(builder, loc, getRound).getResult(0); genIeeeSetRoundingMode({args[1]}); } if (mlir::cast(resultType).getWidth() == 16) - a = builder.create( - loc, mlir::Float32Type::get(builder.getContext()), a); - mlir::Value result = builder.create( - loc, resultType, genRuntimeCall("nearbyint", a.getType(), a)); + a = fir::ConvertOp::create(builder, loc, + mlir::Float32Type::get(builder.getContext()), a); + mlir::Value result = fir::ConvertOp::create( + builder, loc, resultType, genRuntimeCall("nearbyint", a.getType(), a)); if (isStaticallyPresent(args[1])) { - builder.create(loc, setRound, mode); + fir::CallOp::create(builder, loc, setRound, mode); } else { - mlir::Value inexact = builder.create( - loc, mlir::arith::CmpFPredicate::ONE, args[0], result); + mlir::Value inexact = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::ONE, args[0], result); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INEXACT, inexact); } return result; @@ -5845,18 +5874,19 @@ void IntrinsicLibrary::genIeeeSetFlagOrHaltingMode( mlir::Type i1Ty = builder.getI1Type(); mlir::Type i32Ty = builder.getIntegerType(32); auto [fieldRef, ignore] = getFieldRef(builder, loc, getBase(args[0])); - mlir::Value field = builder.create(loc, fieldRef); + mlir::Value field = fir::LoadOp::create(builder, loc, fieldRef); mlir::Value except = fir::runtime::genMapExcept( - builder, loc, builder.create(loc, i32Ty, field)); - auto ifOp = builder.create( - loc, builder.create(loc, i1Ty, getBase(args[1])), + builder, loc, fir::ConvertOp::create(builder, loc, i32Ty, field)); + auto ifOp = fir::IfOp::create( + builder, loc, + fir::ConvertOp::create(builder, loc, i1Ty, getBase(args[1])), /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); (isFlag ? fir::runtime::genFeraiseexcept : fir::runtime::genFeenableexcept)( - builder, loc, builder.create(loc, i32Ty, except)); + builder, loc, fir::ConvertOp::create(builder, loc, i32Ty, except)); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); (isFlag ? 
fir::runtime::genFeclearexcept : fir::runtime::genFedisableexcept)( - builder, loc, builder.create(loc, i32Ty, except)); + builder, loc, fir::ConvertOp::create(builder, loc, i32Ty, except)); builder.setInsertionPointAfter(ifOp); } @@ -5873,7 +5903,7 @@ void IntrinsicLibrary::genIeeeSetRoundingMode( checkRadix(builder, loc, fir::getBase(args[1]), "ieee_set_rounding_mode"); auto [fieldRef, fieldTy] = getFieldRef(builder, loc, fir::getBase(args[0])); mlir::func::FuncOp setRound = fir::factory::getLlvmSetRounding(builder); - mlir::Value mode = builder.create(loc, fieldRef); + mlir::Value mode = fir::LoadOp::create(builder, loc, fieldRef); static_assert( _FORTRAN_RUNTIME_IEEE_TO_ZERO >= 0 && _FORTRAN_RUNTIME_IEEE_TO_ZERO <= 3 && @@ -5881,28 +5911,28 @@ void IntrinsicLibrary::genIeeeSetRoundingMode( _FORTRAN_RUNTIME_IEEE_NEAREST <= 3 && _FORTRAN_RUNTIME_IEEE_UP >= 0 && _FORTRAN_RUNTIME_IEEE_UP <= 3 && _FORTRAN_RUNTIME_IEEE_DOWN >= 0 && _FORTRAN_RUNTIME_IEEE_DOWN <= 3 && "unexpected rounding mode mapping"); - mlir::Value mask = builder.create( - loc, builder.createAllOnesInteger(loc, fieldTy), + mlir::Value mask = mlir::arith::ShLIOp::create( + builder, loc, builder.createAllOnesInteger(loc, fieldTy), builder.createIntegerConstant(loc, fieldTy, 2)); - mlir::Value modeIsSupported = builder.create( - loc, mlir::arith::CmpIPredicate::eq, - builder.create(loc, mode, mask), + mlir::Value modeIsSupported = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, + mlir::arith::AndIOp::create(builder, loc, mode, mask), builder.createIntegerConstant(loc, fieldTy, 0)); mlir::Value nearest = builder.createIntegerConstant( loc, fieldTy, _FORTRAN_RUNTIME_IEEE_NEAREST); - mode = builder.create(loc, modeIsSupported, mode, - nearest); - mode = builder.create( - loc, setRound.getFunctionType().getInput(0), mode); - builder.create(loc, setRound, mode); + mode = mlir::arith::SelectOp::create(builder, loc, modeIsSupported, mode, + nearest); + mode = fir::ConvertOp::create(builder, loc, + setRound.getFunctionType().getInput(0), mode); + fir::CallOp::create(builder, loc, setRound, mode); } // IEEE_SET_UNDERFLOW_MODE void IntrinsicLibrary::genIeeeSetUnderflowMode( llvm::ArrayRef args) { assert(args.size() == 1); - mlir::Value gradual = builder.create(loc, builder.getI1Type(), - getBase(args[0])); + mlir::Value gradual = fir::ConvertOp::create( + builder, loc, builder.getI1Type(), getBase(args[0])); fir::runtime::genSetUnderflowMode(builder, loc, {gradual}); } @@ -5916,9 +5946,9 @@ IntrinsicLibrary::genIeeeSignalingCompare(mlir::Type resultType, assert(args.size() == 2); mlir::Value hasNaNOp = genIeeeUnordered(mlir::Type{}, args); mlir::Value res = - builder.create(loc, pred, args[0], args[1]); + mlir::arith::CmpFOp::create(builder, loc, pred, args[0], args[1]); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INVALID, hasNaNOp); - return builder.create(loc, resultType, res); + return fir::ConvertOp::create(builder, loc, resultType, res); } // IEEE_SIGNBIT @@ -5937,9 +5967,9 @@ mlir::Value IntrinsicLibrary::genIeeeSignbit(mlir::Type resultType, } mlir::Type intType = builder.getIntegerType(bitWidth); mlir::Value intVal = - builder.create(loc, intType, realVal); + mlir::arith::BitcastOp::create(builder, loc, intType, realVal); mlir::Value shift = builder.createIntegerConstant(loc, intType, bitWidth - 1); - mlir::Value sign = builder.create(loc, intVal, shift); + mlir::Value sign = mlir::arith::ShRUIOp::create(builder, loc, intVal, shift); return builder.createConvert(loc, resultType, sign); } @@ -5952,21 
+5982,21 @@ IntrinsicLibrary::genIeeeSupportFlag(mlir::Type resultType, mlir::Type i1Ty = builder.getI1Type(); mlir::Type i32Ty = builder.getIntegerType(32); auto [fieldRef, fieldTy] = getFieldRef(builder, loc, getBase(args[0])); - mlir::Value flag = builder.create(loc, fieldRef); + mlir::Value flag = fir::LoadOp::create(builder, loc, fieldRef); mlir::Value standardFlagMask = builder.createIntegerConstant( loc, fieldTy, _FORTRAN_RUNTIME_IEEE_INVALID | _FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO | _FORTRAN_RUNTIME_IEEE_OVERFLOW | _FORTRAN_RUNTIME_IEEE_UNDERFLOW | _FORTRAN_RUNTIME_IEEE_INEXACT); - mlir::Value isStandardFlag = builder.create( - loc, mlir::arith::CmpIPredicate::ne, - builder.create(loc, flag, standardFlagMask), + mlir::Value isStandardFlag = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, + mlir::arith::AndIOp::create(builder, loc, flag, standardFlagMask), builder.createIntegerConstant(loc, fieldTy, 0)); - fir::IfOp ifOp = builder.create(loc, i1Ty, isStandardFlag, - /*withElseRegion=*/true); + fir::IfOp ifOp = fir::IfOp::create(builder, loc, i1Ty, isStandardFlag, + /*withElseRegion=*/true); // Standard flags are supported. builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - builder.create(loc, builder.createBool(loc, true)); + fir::ResultOp::create(builder, loc, builder.createBool(loc, true)); // TargetCharacteristics information for the nonstandard ieee_denorm flag // is not available here. So use a runtime check restricted to possibly @@ -5990,17 +6020,17 @@ IntrinsicLibrary::genIeeeSupportFlag(mlir::Type resultType, } } if (mayBeSupported) { - mlir::Value isDenorm = builder.create( - loc, mlir::arith::CmpIPredicate::eq, flag, + mlir::Value isDenorm = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, flag, builder.createIntegerConstant(loc, fieldTy, _FORTRAN_RUNTIME_IEEE_DENORM)); - mlir::Value result = builder.create( - loc, isDenorm, + mlir::Value result = mlir::arith::AndIOp::create( + builder, loc, isDenorm, fir::runtime::genSupportHalting( - builder, loc, builder.create(loc, i32Ty, flag))); - builder.create(loc, result); + builder, loc, fir::ConvertOp::create(builder, loc, i32Ty, flag))); + fir::ResultOp::create(builder, loc, result); } else { - builder.create(loc, builder.createBool(loc, false)); + fir::ResultOp::create(builder, loc, builder.createBool(loc, false)); } builder.setInsertionPointAfter(ifOp); return builder.createConvert(loc, resultType, ifOp.getResult(0)); @@ -6015,11 +6045,11 @@ fir::ExtendedValue IntrinsicLibrary::genIeeeSupportHalting( assert(args.size() == 1); mlir::Type i32Ty = builder.getIntegerType(32); auto [fieldRef, ignore] = getFieldRef(builder, loc, getBase(args[0])); - mlir::Value field = builder.create(loc, fieldRef); + mlir::Value field = fir::LoadOp::create(builder, loc, fieldRef); return builder.createConvert( loc, resultType, fir::runtime::genSupportHalting( - builder, loc, builder.create(loc, i32Ty, field))); + builder, loc, fir::ConvertOp::create(builder, loc, i32Ty, field))); } // IEEE_SUPPORT_ROUNDING @@ -6036,16 +6066,16 @@ fir::ExtendedValue IntrinsicLibrary::genIeeeSupportRounding( // 4 - to nearest, ties away from zero [not supported] assert(args.size() == 1 || args.size() == 2); auto [fieldRef, fieldTy] = getFieldRef(builder, loc, getBase(args[0])); - mlir::Value mode = builder.create(loc, fieldRef); - mlir::Value lbOk = builder.create( - loc, mlir::arith::CmpIPredicate::sge, mode, + mlir::Value mode = fir::LoadOp::create(builder, loc, fieldRef); + mlir::Value lbOk 
= mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sge, mode, builder.createIntegerConstant(loc, fieldTy, _FORTRAN_RUNTIME_IEEE_TO_ZERO)); - mlir::Value ubOk = builder.create( - loc, mlir::arith::CmpIPredicate::sle, mode, + mlir::Value ubOk = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sle, mode, builder.createIntegerConstant(loc, fieldTy, _FORTRAN_RUNTIME_IEEE_DOWN)); return builder.createConvert( - loc, resultType, builder.create(loc, lbOk, ubOk)); + loc, resultType, mlir::arith::AndIOp::create(builder, loc, lbOk, ubOk)); } // IEEE_SUPPORT_STANDARD @@ -6069,15 +6099,15 @@ IntrinsicLibrary::genIeeeUnordered(mlir::Type resultType, // If there is no result type return an i1 result. assert(args.size() == 2); if (args[0].getType() == args[1].getType()) { - mlir::Value res = builder.create( - loc, mlir::arith::CmpFPredicate::UNO, args[0], args[1]); + mlir::Value res = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::UNO, args[0], args[1]); return resultType ? builder.createConvert(loc, resultType, res) : res; } assert(resultType && "expecting a (mixed arg type) unordered result type"); mlir::Type i1Ty = builder.getI1Type(); mlir::Value xIsNan = genIsFPClass(i1Ty, args[0], nanTest); mlir::Value yIsNan = genIsFPClass(i1Ty, args[1], nanTest); - mlir::Value res = builder.create(loc, xIsNan, yIsNan); + mlir::Value res = mlir::arith::OrIOp::create(builder, loc, xIsNan, yIsNan); return builder.createConvert(loc, resultType, res); } @@ -6202,22 +6232,22 @@ mlir::Value IntrinsicLibrary::genIeeeValue(mlir::Type resultType, mlir::Value which; if (args.size() == 2) { // user call auto [index, ignore] = getFieldRef(builder, loc, args[1]); - which = builder.create(loc, index); + which = fir::LoadOp::create(builder, loc, index); } else { // compiler generated call which = args[0]; } - mlir::Value bits = builder.create( - loc, - builder.create( - loc, builder.getRefType(valueTy), - builder.create(loc, builder.getRefType(tableTy), - builder.getSymbolRefAttr(tableName)), + mlir::Value bits = fir::LoadOp::create( + builder, loc, + fir::CoordinateOp::create( + builder, loc, builder.getRefType(valueTy), + fir::AddrOfOp::create(builder, loc, builder.getRefType(tableTy), + builder.getSymbolRefAttr(tableName)), which)); if (bitWidth > 64) - bits = builder.create( - loc, builder.createConvert(loc, intType, bits), + bits = mlir::arith::ShLIOp::create( + builder, loc, builder.createConvert(loc, intType, bits), builder.createIntegerConstant(loc, intType, bitWidth - 64)); - return builder.create(loc, realType, bits); + return mlir::arith::BitcastOp::create(builder, loc, realType, bits); } // IEOR @@ -6259,13 +6289,14 @@ IntrinsicLibrary::genIndex(mlir::Type resultType, builder.getContext(), builder.getKindMap().defaultLogicalKind()); mlir::Value temp = builder.createTemporary(loc, logTy); mlir::Value castb = builder.createConvert(loc, logTy, b); - builder.create(loc, castb, temp); + fir::StoreOp::create(builder, loc, castb, temp); return builder.createBox(loc, temp); }; - mlir::Value backOpt = isStaticallyAbsent(args, 2) - ? builder.create( - loc, fir::BoxType::get(builder.getI1Type())) - : makeRefThenEmbox(fir::getBase(args[2])); + mlir::Value backOpt = + isStaticallyAbsent(args, 2) + ? fir::AbsentOp::create(builder, loc, + fir::BoxType::get(builder.getI1Type())) + : makeRefThenEmbox(fir::getBase(args[2])); mlir::Value kindVal = isStaticallyAbsent(args, 3) ? 
builder.createIntegerConstant( loc, builder.getIndexType(), @@ -6314,8 +6345,8 @@ mlir::Value IntrinsicLibrary::genIsIostatValue(mlir::Type resultType, llvm::ArrayRef args) { assert(args.size() == 1); - return builder.create( - loc, mlir::arith::CmpIPredicate::eq, args[0], + return mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, args[0], builder.createIntegerConstant(loc, args[0].getType(), value)); } @@ -6342,16 +6373,16 @@ mlir::Value IntrinsicLibrary::genIshft(mlir::Type resultType, mlir::Value word = args[0]; if (word.getType().isUnsignedInteger()) word = builder.createConvert(loc, signlessType, word); - auto left = builder.create(loc, word, absShift); - auto right = builder.create(loc, word, absShift); - auto shiftIsLarge = builder.create( - loc, mlir::arith::CmpIPredicate::sge, absShift, bitSize); - auto shiftIsNegative = builder.create( - loc, mlir::arith::CmpIPredicate::slt, shift, zero); + auto left = mlir::arith::ShLIOp::create(builder, loc, word, absShift); + auto right = mlir::arith::ShRUIOp::create(builder, loc, word, absShift); + auto shiftIsLarge = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sge, absShift, bitSize); + auto shiftIsNegative = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, shift, zero); auto sel = - builder.create(loc, shiftIsNegative, right, left); + mlir::arith::SelectOp::create(builder, loc, shiftIsNegative, right, left); mlir::Value result = - builder.create(loc, shiftIsLarge, zero, sel); + mlir::arith::SelectOp::create(builder, loc, shiftIsLarge, zero, sel); if (resultType.isUnsignedInteger()) return builder.createConvert(loc, resultType, result); return result; @@ -6392,42 +6423,42 @@ mlir::Value IntrinsicLibrary::genIshftc(mlir::Type resultType, mlir::Value zero = builder.createIntegerConstant(loc, signlessType, 0); mlir::Value ones = builder.createAllOnesInteger(loc, signlessType); mlir::Value absShift = genAbs(signlessType, {shift}); - auto elseSize = builder.create(loc, size, absShift); - auto shiftIsZero = builder.create( - loc, mlir::arith::CmpIPredicate::eq, shift, zero); - auto shiftEqualsSize = builder.create( - loc, mlir::arith::CmpIPredicate::eq, absShift, size); + auto elseSize = mlir::arith::SubIOp::create(builder, loc, size, absShift); + auto shiftIsZero = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, shift, zero); + auto shiftEqualsSize = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, absShift, size); auto shiftIsNop = - builder.create(loc, shiftIsZero, shiftEqualsSize); - auto shiftIsPositive = builder.create( - loc, mlir::arith::CmpIPredicate::sgt, shift, zero); - auto leftSize = builder.create(loc, shiftIsPositive, - absShift, elseSize); - auto rightSize = builder.create(loc, shiftIsPositive, - elseSize, absShift); - auto hasUnchanged = builder.create( - loc, mlir::arith::CmpIPredicate::ne, size, bitSize); - auto unchangedTmp1 = builder.create(loc, word, size); + mlir::arith::OrIOp::create(builder, loc, shiftIsZero, shiftEqualsSize); + auto shiftIsPositive = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sgt, shift, zero); + auto leftSize = mlir::arith::SelectOp::create(builder, loc, shiftIsPositive, + absShift, elseSize); + auto rightSize = mlir::arith::SelectOp::create(builder, loc, shiftIsPositive, + elseSize, absShift); + auto hasUnchanged = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, size, bitSize); + auto unchangedTmp1 = 
mlir::arith::ShRUIOp::create(builder, loc, word, size); auto unchangedTmp2 = - builder.create(loc, unchangedTmp1, size); - auto unchanged = builder.create(loc, hasUnchanged, - unchangedTmp2, zero); + mlir::arith::ShLIOp::create(builder, loc, unchangedTmp1, size); + auto unchanged = mlir::arith::SelectOp::create(builder, loc, hasUnchanged, + unchangedTmp2, zero); auto leftMaskShift = - builder.create(loc, bitSize, leftSize); + mlir::arith::SubIOp::create(builder, loc, bitSize, leftSize); auto leftMask = - builder.create(loc, ones, leftMaskShift); - auto leftTmp = builder.create(loc, word, rightSize); - auto left = builder.create(loc, leftTmp, leftMask); + mlir::arith::ShRUIOp::create(builder, loc, ones, leftMaskShift); + auto leftTmp = mlir::arith::ShRUIOp::create(builder, loc, word, rightSize); + auto left = mlir::arith::AndIOp::create(builder, loc, leftTmp, leftMask); auto rightMaskShift = - builder.create(loc, bitSize, rightSize); + mlir::arith::SubIOp::create(builder, loc, bitSize, rightSize); auto rightMask = - builder.create(loc, ones, rightMaskShift); - auto rightTmp = builder.create(loc, word, rightMask); - auto right = builder.create(loc, rightTmp, leftSize); - auto resTmp = builder.create(loc, unchanged, left); - auto res = builder.create(loc, resTmp, right); + mlir::arith::ShRUIOp::create(builder, loc, ones, rightMaskShift); + auto rightTmp = mlir::arith::AndIOp::create(builder, loc, word, rightMask); + auto right = mlir::arith::ShLIOp::create(builder, loc, rightTmp, leftSize); + auto resTmp = mlir::arith::OrIOp::create(builder, loc, unchanged, left); + auto res = mlir::arith::OrIOp::create(builder, loc, resTmp, right); mlir::Value result = - builder.create(loc, shiftIsNop, word, res); + mlir::arith::SelectOp::create(builder, loc, shiftIsNop, word, res); if (resultType.isUnsignedInteger()) return builder.createConvert(loc, resultType, result); return result; @@ -6439,7 +6470,7 @@ mlir::Value IntrinsicLibrary::genLeadz(mlir::Type resultType, assert(args.size() == 1); mlir::Value result = - builder.create(loc, args); + mlir::math::CountLeadingZerosOp::create(builder, loc, args); return builder.createConvert(loc, resultType, result); } @@ -6507,18 +6538,18 @@ IntrinsicLibrary::genLoc(mlir::Type resultType, // created when preparing the argument cases, but the box can be safely be // used for all those cases and the address will be null if absent. mlir::Value isPresent = - builder.create(loc, builder.getI1Type(), box); + fir::IsPresentOp::create(builder, loc, builder.getI1Type(), box); return builder .genIfOp(loc, {resultType}, isPresent, /*withElseRegion=*/true) .genThen([&]() { mlir::Value argAddr = getAddrFromBox(builder, loc, args[0], isFunc); mlir::Value cast = builder.createConvert(loc, resultType, argAddr); - builder.create(loc, cast); + fir::ResultOp::create(builder, loc, cast); }) .genElse([&]() { mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); - builder.create(loc, zero); + fir::ResultOp::create(builder, loc, zero); }) .getResults()[0]; } @@ -6551,12 +6582,12 @@ mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType, // non-deterministic result. Other compilers don't produce a consistent result // in this case either, so we choose the most efficient implementation. 
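The hunk that follows lowers the MASK intrinsics: build an all-ones word, shift it by `bitSize - bitsToSet`, and select zero when `bitsToSet` is zero, since a shift by the full bit width is not a well-defined operation. A standalone sketch of the same computation for a 32-bit MASKL-style mask; the helper name `maskl32` is made up for illustration, and for the right-justified variant the shift direction presumably flips:

```cpp
#include <cstdint>
#include <cstdio>

// All-ones word shifted left by (bitSize - n); the n == 0 case is handled
// with a select, as in the lowering, because a shift of 32 on a 32-bit
// value is undefined in C++ (and poison-prone in IR).
std::uint32_t maskl32(std::uint32_t n) {
  const std::uint32_t ones = ~std::uint32_t{0};
  const std::uint32_t bitSize = 32;
  return n == 0 ? 0 : ones << (bitSize - n);
}

int main() {
  std::printf("%08x\n", maskl32(0));  // 00000000
  std::printf("%08x\n", maskl32(1));  // 80000000
  std::printf("%08x\n", maskl32(8));  // ff000000
  std::printf("%08x\n", maskl32(32)); // ffffffff
}
```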
mlir::Value shift = - builder.create(loc, bitSize, bitsToSet); - mlir::Value shifted = builder.create(loc, ones, shift); - mlir::Value isZero = builder.create( - loc, mlir::arith::CmpIPredicate::eq, bitsToSet, zero); + mlir::arith::SubIOp::create(builder, loc, bitSize, bitsToSet); + mlir::Value shifted = Shift::create(builder, loc, ones, shift); + mlir::Value isZero = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, bitsToSet, zero); mlir::Value result = - builder.create(loc, isZero, zero, shifted); + mlir::arith::SelectOp::create(builder, loc, isZero, zero, shifted); if (resultType.isUnsignedInteger()) return builder.createConvert(loc, resultType, result); return result; @@ -6574,8 +6605,8 @@ IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, mlir::Value arg1 = args[1]; if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = builder.create( - loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); mlir::Type retTy = mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); @@ -6584,10 +6615,10 @@ IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, .create(loc, retTy, args[0], arg1, mlir::NVVM::MatchSyncKind::all) .getResult(); - auto value = builder.create(loc, match, 0); - auto pred = builder.create(loc, match, 1); - auto conv = builder.create(loc, resultType, pred); - builder.create(loc, conv, args[2]); + auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); + auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); + auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); + fir::StoreOp::create(builder, loc, conv, args[2]); return value; } @@ -6597,14 +6628,14 @@ mlir::Value IntrinsicLibrary::genVoteSync(mlir::Type resultType, llvm::ArrayRef args) { assert(args.size() == 2); mlir::Value arg1 = - builder.create(loc, builder.getI1Type(), args[1]); + fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot ? builder.getI32Type() : builder.getI1Type(); auto voteRes = - builder.create(loc, resTy, args[0], arg1, kind) + mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) .getResult(); - return builder.create(loc, resultType, voteRes); + return fir::ConvertOp::create(builder, loc, resultType, voteRes); } // MATCH_ANY_SYNC @@ -6616,8 +6647,8 @@ IntrinsicLibrary::genMatchAnySync(mlir::Type resultType, mlir::Value arg1 = args[1]; if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = builder.create( - loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? 
builder.getI32Type() : builder.getI64Type(), arg1); return builder .create(loc, resultType, args[0], arg1, @@ -6701,10 +6732,10 @@ IntrinsicLibrary::genMerge(mlir::Type, mlir::Value other) -> mlir::Value { mlir::Type otherType = other.getType(); if (mlir::isa(otherType)) - return builder.create(loc, otherType, polymorphic, - /*shape*/ mlir::Value{}, - /*slice=*/mlir::Value{}); - return builder.create(loc, otherType, polymorphic); + return fir::ReboxOp::create(builder, loc, otherType, polymorphic, + /*shape*/ mlir::Value{}, + /*slice=*/mlir::Value{}); + return fir::BoxAddrOp::create(builder, loc, otherType, polymorphic); }; if (fir::isPolymorphicType(tsource.getType()) && !fir::isPolymorphicType(fsource.getType())) { @@ -6720,8 +6751,8 @@ IntrinsicLibrary::genMerge(mlir::Type, // fulfill mlir::SelectOp constraint that the MLIR types must be the same. fsourceCast = builder.createConvert(loc, tsource.getType(), fsource); } - auto rslt = builder.create(loc, mask, tsourceCast, - fsourceCast); + auto rslt = mlir::arith::SelectOp::create(builder, loc, mask, tsourceCast, + fsourceCast); if (isCharRslt) { // Need a CharBoxValue for character results const fir::CharBoxValue *charBox = args[0].getCharBox(); @@ -6762,7 +6793,7 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType, args[0], args[1]); } if (mlir::isa(resultType)) - return builder.create(loc, args[0], args[1]); + return mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]); // Use runtime. return builder.createConvert( @@ -6792,19 +6823,19 @@ mlir::Value IntrinsicLibrary::genModulo(mlir::Type resultType, } if (mlir::isa(resultType)) { auto remainder = - builder.create(loc, args[0], args[1]); - auto argXor = builder.create(loc, args[0], args[1]); + mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]); + auto argXor = mlir::arith::XOrIOp::create(builder, loc, args[0], args[1]); mlir::Value zero = builder.createIntegerConstant(loc, argXor.getType(), 0); - auto argSignDifferent = builder.create( - loc, mlir::arith::CmpIPredicate::slt, argXor, zero); - auto remainderIsNotZero = builder.create( - loc, mlir::arith::CmpIPredicate::ne, remainder, zero); - auto mustAddP = builder.create(loc, remainderIsNotZero, - argSignDifferent); + auto argSignDifferent = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, argXor, zero); + auto remainderIsNotZero = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, remainder, zero); + auto mustAddP = mlir::arith::AndIOp::create( + builder, loc, remainderIsNotZero, argSignDifferent); auto remPlusP = - builder.create(loc, remainder, args[1]); - return builder.create(loc, mustAddP, remPlusP, - remainder); + mlir::arith::AddIOp::create(builder, loc, remainder, args[1]); + return mlir::arith::SelectOp::create(builder, loc, mustAddP, remPlusP, + remainder); } auto fastMathFlags = builder.getFastMathFlags(); @@ -6817,21 +6848,21 @@ mlir::Value IntrinsicLibrary::genModulo(mlir::Type resultType, loc, resultType, fir::runtime::genModulo(builder, loc, args[0], args[1])); - auto remainder = builder.create(loc, args[0], args[1]); + auto remainder = mlir::arith::RemFOp::create(builder, loc, args[0], args[1]); mlir::Value zero = builder.createRealZeroConstant(loc, remainder.getType()); - auto remainderIsNotZero = builder.create( - loc, mlir::arith::CmpFPredicate::UNE, remainder, zero); - auto aLessThanZero = builder.create( - loc, mlir::arith::CmpFPredicate::OLT, args[0], zero); - auto pLessThanZero = builder.create( - loc, 
mlir::arith::CmpFPredicate::OLT, args[1], zero); + auto remainderIsNotZero = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::UNE, remainder, zero); + auto aLessThanZero = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OLT, args[0], zero); + auto pLessThanZero = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OLT, args[1], zero); auto argSignDifferent = - builder.create(loc, aLessThanZero, pLessThanZero); - auto mustAddP = builder.create(loc, remainderIsNotZero, - argSignDifferent); - auto remPlusP = builder.create(loc, remainder, args[1]); - return builder.create(loc, mustAddP, remPlusP, - remainder); + mlir::arith::XOrIOp::create(builder, loc, aLessThanZero, pLessThanZero); + auto mustAddP = mlir::arith::AndIOp::create(builder, loc, remainderIsNotZero, + argSignDifferent); + auto remPlusP = mlir::arith::AddFOp::create(builder, loc, remainder, args[1]); + return mlir::arith::SelectOp::create(builder, loc, mustAddP, remPlusP, + remainder); } void IntrinsicLibrary::genMoveAlloc(llvm::ArrayRef args) { @@ -6846,7 +6877,7 @@ void IntrinsicLibrary::genMoveAlloc(llvm::ArrayRef args) { mlir::Value errBox = isStaticallyPresent(errMsg) ? fir::getBase(errMsg) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); const fir::MutableBoxValue *fromBox = from.getBoxOf(); const fir::MutableBoxValue *toBox = to.getBoxOf(); @@ -6903,33 +6934,34 @@ void IntrinsicLibrary::genMvbits(llvm::ArrayRef args) { mlir::Type toType{fir::dyn_cast_ptrEleTy(toAddr.getType())}; assert(toType.getIntOrFloatBitWidth() == fromType.getIntOrFloatBitWidth() && "mismatched mvbits types"); - auto to = builder.create(loc, signlessType, toAddr); + auto to = fir::LoadOp::create(builder, loc, signlessType, toAddr); mlir::Value topos = builder.createConvert(loc, signlessType, unbox(args[4])); mlir::Value zero = builder.createIntegerConstant(loc, signlessType, 0); mlir::Value ones = builder.createAllOnesInteger(loc, signlessType); mlir::Value bitSize = builder.createIntegerConstant( loc, signlessType, mlir::cast(signlessType).getWidth()); - auto shiftCount = builder.create(loc, bitSize, len); - auto mask = builder.create(loc, ones, shiftCount); - auto unchangedTmp1 = builder.create(loc, mask, topos); + auto shiftCount = mlir::arith::SubIOp::create(builder, loc, bitSize, len); + auto mask = mlir::arith::ShRUIOp::create(builder, loc, ones, shiftCount); + auto unchangedTmp1 = mlir::arith::ShLIOp::create(builder, loc, mask, topos); auto unchangedTmp2 = - builder.create(loc, unchangedTmp1, ones); - auto unchanged = builder.create(loc, unchangedTmp2, to); + mlir::arith::XOrIOp::create(builder, loc, unchangedTmp1, ones); + auto unchanged = mlir::arith::AndIOp::create(builder, loc, unchangedTmp2, to); if (fromType.isUnsignedInteger()) from = builder.createConvert(loc, signlessType, from); - auto frombitsTmp1 = builder.create(loc, from, frompos); + auto frombitsTmp1 = mlir::arith::ShRUIOp::create(builder, loc, from, frompos); auto frombitsTmp2 = - builder.create(loc, frombitsTmp1, mask); - auto frombits = builder.create(loc, frombitsTmp2, topos); - auto resTmp = builder.create(loc, unchanged, frombits); - auto lenIsZero = builder.create( - loc, mlir::arith::CmpIPredicate::eq, len, zero); + mlir::arith::AndIOp::create(builder, loc, frombitsTmp1, mask); + auto frombits = + mlir::arith::ShLIOp::create(builder, loc, frombitsTmp2, topos); + auto resTmp = mlir::arith::OrIOp::create(builder, loc, unchanged, frombits); + 
auto lenIsZero = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, len, zero); mlir::Value res = - builder.create(loc, lenIsZero, to, resTmp); + mlir::arith::SelectOp::create(builder, loc, lenIsZero, to, resTmp); if (toType.isUnsignedInteger()) res = builder.createConvert(loc, toType, res); - builder.create(loc, res, toAddr); + fir::StoreOp::create(builder, loc, res, toAddr); } // NEAREST, IEEE_NEXT_AFTER, IEEE_NEXT_DOWN, IEEE_NEXT_UP @@ -6971,7 +7003,7 @@ mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType, // If isNan(Y), set X to a qNaN that will propagate to the resultIsX result. mlir::Value qNan = genQNan(xType); mlir::Value isFPClass = genIsFPClass(i1Ty, args[1], nanTest); - x = builder.create(loc, isFPClass, qNan, x); + x = mlir::arith::SelectOp::create(builder, loc, isFPClass, qNan, x); } mlir::Value resultIsX = genIsFPClass(i1Ty, x, nanTest); mlir::Type intType = builder.getIntegerType(xBitWidth); @@ -6982,15 +7014,15 @@ mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType, if constexpr (proc == NearestProc::Nearest) { // Arg S must not be zero. fir::IfOp ifOp = - builder.create(loc, genIsFPClass(i1Ty, args[1], zeroTest), - /*withElseRegion=*/false); + fir::IfOp::create(builder, loc, genIsFPClass(i1Ty, args[1], zeroTest), + /*withElseRegion=*/false); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); fir::runtime::genReportFatalUserError( builder, loc, "intrinsic nearest S argument is zero"); builder.setInsertionPointAfter(ifOp); mlir::Value sSign = IntrinsicLibrary::genIeeeSignbit(intType, {args[1]}); - valueUp = builder.create( - loc, mlir::arith::CmpIPredicate::ne, sSign, one); + valueUp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, sSign, one); } else if constexpr (proc == NearestProc::NextAfter) { // Convert X and Y to a common type to allow comparison. Direct conversions // between kinds 2, 3, 10, and 16 are not all supported. These conversions @@ -7011,58 +7043,58 @@ mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType, if (xBitWidth > 32 && xBitWidth > yBitWidth) y = builder.createConvert(loc, xType, y); } - resultIsX = builder.create( - loc, resultIsX, - builder.create( - loc, mlir::arith::CmpFPredicate::OEQ, x1, y)); - valueUp = builder.create( - loc, mlir::arith::CmpFPredicate::OLT, x1, y); + resultIsX = mlir::arith::OrIOp::create( + builder, loc, resultIsX, + mlir::arith::CmpFOp::create(builder, loc, + mlir::arith::CmpFPredicate::OEQ, x1, y)); + valueUp = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OLT, x1, y); } else if constexpr (proc == NearestProc::NextDown) { valueUp = builder.createBool(loc, false); } else if constexpr (proc == NearestProc::NextUp) { valueUp = builder.createBool(loc, true); } - mlir::Value magnitudeUp = builder.create( - loc, mlir::arith::CmpIPredicate::ne, valueUp, + mlir::Value magnitudeUp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, valueUp, IntrinsicLibrary::genIeeeSignbit(i1Ty, {args[0]})); - resultIsX = builder.create( - loc, resultIsX, - builder.create( - loc, genIsFPClass(i1Ty, x, infiniteTest), magnitudeUp)); + resultIsX = mlir::arith::OrIOp::create( + builder, loc, resultIsX, + mlir::arith::AndIOp::create( + builder, loc, genIsFPClass(i1Ty, x, infiniteTest), magnitudeUp)); // Result is X. (For ieee_next_after with isNan(Y), X has been set to a NaN.) 
- fir::IfOp outerIfOp = builder.create(loc, resultType, resultIsX, - /*withElseRegion=*/true); + fir::IfOp outerIfOp = fir::IfOp::create(builder, loc, resultType, resultIsX, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&outerIfOp.getThenRegion().front()); if constexpr (proc == NearestProc::NextDown || proc == NearestProc::NextUp) genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INVALID, genIsFPClass(i1Ty, x, snanTest)); - builder.create(loc, x); + fir::ResultOp::create(builder, loc, x); // Result is minPositiveSubnormal or minNegativeSubnormal. (X is zero.) builder.setInsertionPointToStart(&outerIfOp.getElseRegion().front()); - mlir::Value resultIsMinSubnormal = builder.create( - loc, mlir::arith::CmpFPredicate::OEQ, x, + mlir::Value resultIsMinSubnormal = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OEQ, x, builder.createRealZeroConstant(loc, xType)); fir::IfOp innerIfOp = - builder.create(loc, resultType, resultIsMinSubnormal, - /*withElseRegion=*/true); + fir::IfOp::create(builder, loc, resultType, resultIsMinSubnormal, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&innerIfOp.getThenRegion().front()); mlir::Value minPositiveSubnormal = - builder.create(loc, resultType, one); - mlir::Value minNegativeSubnormal = builder.create( - loc, resultType, - builder.create( - loc, intType, + mlir::arith::BitcastOp::create(builder, loc, resultType, one); + mlir::Value minNegativeSubnormal = mlir::arith::BitcastOp::create( + builder, loc, resultType, + mlir::arith::ConstantOp::create( + builder, loc, intType, builder.getIntegerAttr( intType, llvm::APInt::getBitsSetWithWrap( xBitWidth, /*lo=*/xBitWidth - 1, /*hi=*/1)))); - mlir::Value result = builder.create( - loc, valueUp, minPositiveSubnormal, minNegativeSubnormal); + mlir::Value result = mlir::arith::SelectOp::create( + builder, loc, valueUp, minPositiveSubnormal, minNegativeSubnormal); if constexpr (proc == NearestProc::Nearest || proc == NearestProc::NextAfter) genRaiseExcept(_FORTRAN_RUNTIME_IEEE_UNDERFLOW | _FORTRAN_RUNTIME_IEEE_INEXACT); - builder.create(loc, result); + fir::ResultOp::create(builder, loc, result); // Result is (X + minPositiveSubnormal) or (X - minPositiveSubnormal). builder.setInsertionPointToStart(&innerIfOp.getElseRegion().front()); @@ -7088,15 +7120,15 @@ mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType, genRuntimeCall("feraiseexcept", i32Ty, excepts); genRuntimeCall("feenableexcept", i32Ty, mask); } - builder.create(loc, result); + fir::ResultOp::create(builder, loc, result); } else { // Kind 2, 3, 4, 8, 16. Increment or decrement X cast to integer. 
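The else-branch that follows implements the next-up/next-down step for the ordinary kinds by bitcasting X to a same-width integer and adding or subtracting 1: for finite, non-zero IEEE-754 values, adjacent representable magnitudes have adjacent bit patterns, and the zero, NaN, and infinity cases have already been peeled off above. A standalone sketch of that trick for `float`, assuming C++20 `std::bit_cast` (not used in the patch itself):

```cpp
#include <bit>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Move one representable float away from zero (magnitude up) or toward
// zero (magnitude down) by nudging the raw bit pattern.
float nudge(float x, bool magnitudeUp) {
  std::uint32_t bits = std::bit_cast<std::uint32_t>(x);
  bits = magnitudeUp ? bits + 1 : bits - 1;
  return std::bit_cast<float>(bits);
}

int main() {
  float x = 1.0f;
  std::printf("%a\n", nudge(x, /*magnitudeUp=*/true));  // 0x1.000002p+0
  std::printf("%a\n", nudge(x, /*magnitudeUp=*/false)); // 0x1.fffffep-1
  // Matches the library's next-after in both directions for this value.
  std::printf("%d %d\n",
              nudge(x, true) == std::nextafterf(x, 2.0f),
              nudge(x, false) == std::nextafterf(x, 0.0f)); // 1 1
}
```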
- mlir::Value intX = builder.create<mlir::arith::BitcastOp>(loc, intType, x); - mlir::Value add = builder.create<mlir::arith::AddIOp>(loc, intX, one); - mlir::Value sub = builder.create<mlir::arith::SubIOp>(loc, intX, one); - result = builder.create<mlir::arith::BitcastOp>( - loc, resultType, - builder.create<mlir::arith::SelectOp>(loc, magnitudeUp, add, sub)); + mlir::Value intX = mlir::arith::BitcastOp::create(builder, loc, intType, x); + mlir::Value add = mlir::arith::AddIOp::create(builder, loc, intX, one); + mlir::Value sub = mlir::arith::SubIOp::create(builder, loc, intX, one); + result = mlir::arith::BitcastOp::create( + builder, loc, resultType, + mlir::arith::SelectOp::create(builder, loc, magnitudeUp, add, sub)); if constexpr (proc == NearestProc::Nearest || proc == NearestProc::NextAfter) { genRaiseExcept(_FORTRAN_RUNTIME_IEEE_OVERFLOW | @@ -7106,11 +7138,11 @@ mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType, _FORTRAN_RUNTIME_IEEE_INEXACT, genIsFPClass(i1Ty, result, subnormalTest)); } - builder.create<fir::ResultOp>(loc, result); + fir::ResultOp::create(builder, loc, result); } builder.setInsertionPointAfter(innerIfOp); - builder.create<fir::ResultOp>(loc, innerIfOp.getResult(0)); + fir::ResultOp::create(builder, loc, innerIfOp.getResult(0)); builder.setInsertionPointAfter(outerIfOp); return outerIfOp.getResult(0); } @@ -7192,10 +7224,18 @@ IntrinsicLibrary::genNull(mlir::Type, llvm::ArrayRef<fir::ExtendedValue> args) { mlir::Value boxStorage = builder.createTemporary(loc, boxType); mlir::Value box = fir::factory::createUnallocatedBox( builder, loc, boxType, mold->nonDeferredLenParams()); - builder.create<fir::StoreOp>(loc, box, boxStorage); + fir::StoreOp::create(builder, loc, box, boxStorage); return fir::MutableBoxValue(boxStorage, mold->nonDeferredLenParams(), {}); } +// CLOCK, CLOCK64, GLOBALTIMER +template <typename OpTy> +mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0 && "expect no arguments"); + return OpTy::create(builder, loc, resultType).getResult(); +} + // PACK fir::ExtendedValue IntrinsicLibrary::genPack(mlir::Type resultType, @@ -7210,10 +7250,11 @@ IntrinsicLibrary::genPack(mlir::Type resultType, mlir::Value mask = builder.createBox(loc, args[1]); // Handle optional vector argument - mlir::Value vector = isStaticallyAbsent(args, 2) - ? builder.create<fir::AbsentOp>( - loc, fir::BoxType::get(builder.getI1Type())) - : builder.createBox(loc, args[2]); + mlir::Value vector = + isStaticallyAbsent(args, 2) + ? fir::AbsentOp::create(builder, loc, + fir::BoxType::get(builder.getI1Type())) + : builder.createBox(loc, args[2]); // Create mutable fir.box to be passed to the runtime for the result.
mlir::Type resultArrayType = builder.getVarLenSeqTy(resultType, 1); @@ -7273,7 +7314,7 @@ void IntrinsicLibrary::genPerror(llvm::ArrayRef args) { fir::ExtendedValue str = args[0]; const auto *box = str.getBoxOf(); mlir::Value addr = - builder.create(loc, box->getMemTy(), fir::getBase(*box)); + fir::BoxAddrOp::create(builder, loc, box->getMemTy(), fir::getBase(*box)); fir::runtime::genPerror(builder, loc, addr); } @@ -7282,7 +7323,7 @@ mlir::Value IntrinsicLibrary::genPopcnt(mlir::Type resultType, llvm::ArrayRef args) { assert(args.size() == 1); - mlir::Value count = builder.create(loc, args); + mlir::Value count = mlir::math::CtPopOp::create(builder, loc, args); return builder.createConvert(loc, resultType, count); } @@ -7295,7 +7336,7 @@ mlir::Value IntrinsicLibrary::genPoppar(mlir::Type resultType, mlir::Value count = genPopcnt(resultType, args); mlir::Value one = builder.createIntegerConstant(loc, resultType, 1); - return builder.create(loc, count, one); + return mlir::arith::AndIOp::create(builder, loc, count, one); } // PRESENT @@ -7303,8 +7344,8 @@ fir::ExtendedValue IntrinsicLibrary::genPresent(mlir::Type, llvm::ArrayRef args) { assert(args.size() == 1); - return builder.create(loc, builder.getI1Type(), - fir::getBase(args[0])); + return fir::IsPresentOp::create(builder, loc, builder.getI1Type(), + fir::getBase(args[0])); } // PRODUCT @@ -7369,7 +7410,7 @@ void IntrinsicLibrary::genRandomSeed(llvm::ArrayRef args) { auto getDesc = [&](int i) { return isStaticallyPresent(args[i]) ? fir::getBase(args[i]) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); }; mlir::Value size = getDesc(0); mlir::Value put = getDesc(1); @@ -7413,13 +7454,13 @@ IntrinsicLibrary::genReduce(mlir::Type resultType, bool absentDim = isStaticallyAbsent(args[2]); auto mask = isStaticallyAbsent(args[3]) - ? builder.create( - loc, fir::BoxType::get(builder.getI1Type())) + ? fir::AbsentOp::create( + builder, loc, fir::BoxType::get(builder.getI1Type())) : builder.createBox(loc, args[3]); mlir::Value identity = isStaticallyAbsent(args[4]) - ? builder.create(loc, fir::ReferenceType::get(eleTy)) + ? fir::AbsentOp::create(builder, loc, fir::ReferenceType::get(eleTy)) : fir::getBase(args[4]); mlir::Value ordered = isStaticallyAbsent(args[5]) @@ -7435,7 +7476,7 @@ IntrinsicLibrary::genReduce(mlir::Type resultType, ordered, result, argByRef); if (fir::isa_derived(eleTy)) return result; - return builder.create(loc, result); + return fir::LoadOp::create(builder, loc, result); } if (fir::isa_char(eleTy)) { auto charTy = mlir::dyn_cast_or_null(resultType); @@ -7485,7 +7526,7 @@ IntrinsicLibrary::genRename(std::optional resultType, auto statusAddr = builder.createTemporary(loc, *resultType); auto statusBox = builder.createBox(loc, statusAddr); fir::runtime::genRename(builder, loc, path1, path2, statusBox); - return builder.create(loc, statusAddr); + return fir::LoadOp::create(builder, loc, statusAddr); } else { // code-gen for the procedure form of RENAME mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); @@ -7493,7 +7534,7 @@ IntrinsicLibrary::genRename(std::optional resultType, mlir::Value statusBox = isStaticallyPresent(status) ? 
fir::getBase(status) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); fir::runtime::genRename(builder, loc, path1, path2, statusBox); return {}; } @@ -7538,16 +7579,18 @@ IntrinsicLibrary::genReshape(mlir::Type resultType, TODO(loc, "intrinsic: reshape requires computing rank of result"); // Handle optional pad argument - mlir::Value pad = isStaticallyAbsent(args[2]) - ? builder.create( - loc, fir::BoxType::get(builder.getI1Type())) - : builder.createBox(loc, args[2]); + mlir::Value pad = + isStaticallyAbsent(args[2]) + ? fir::AbsentOp::create(builder, loc, + fir::BoxType::get(builder.getI1Type())) + : builder.createBox(loc, args[2]); // Handle optional order argument - mlir::Value order = isStaticallyAbsent(args[3]) - ? builder.create( - loc, fir::BoxType::get(builder.getI1Type())) - : builder.createBox(loc, args[3]); + mlir::Value order = + isStaticallyAbsent(args[3]) + ? fir::AbsentOp::create(builder, loc, + fir::BoxType::get(builder.getI1Type())) + : builder.createBox(loc, args[3]); // Create mutable fir.box to be passed to the runtime for the result. mlir::Type type = builder.getVarLenSeqTy(resultType, resultRank); @@ -7618,26 +7661,27 @@ mlir::Value IntrinsicLibrary::genScale(mlir::Type resultType, // If X is finite and result is infinite, signal IEEE_OVERFLOW // If X is finite and scale(result, -I) != X, signal IEEE_UNDERFLOW fir::IfOp outerIfOp = - builder.create(loc, genIsFPClass(i1Ty, args[0], finiteTest), - /*withElseRegion=*/false); + fir::IfOp::create(builder, loc, genIsFPClass(i1Ty, args[0], finiteTest), + /*withElseRegion=*/false); builder.setInsertionPointToStart(&outerIfOp.getThenRegion().front()); fir::IfOp innerIfOp = - builder.create(loc, genIsFPClass(i1Ty, result, infiniteTest), - /*withElseRegion=*/true); + fir::IfOp::create(builder, loc, genIsFPClass(i1Ty, result, infiniteTest), + /*withElseRegion=*/true); builder.setInsertionPointToStart(&innerIfOp.getThenRegion().front()); genRaiseExcept(_FORTRAN_RUNTIME_IEEE_OVERFLOW | _FORTRAN_RUNTIME_IEEE_INEXACT); builder.setInsertionPointToStart(&innerIfOp.getElseRegion().front()); - mlir::Value minusI = builder.create( - loc, args[1], builder.createAllOnesInteger(loc, args[1].getType())); + mlir::Value minusI = mlir::arith::MulIOp::create( + builder, loc, args[1], + builder.createAllOnesInteger(loc, args[1].getType())); mlir::Value reverseResult = builder.createConvert( loc, resultType, fir::runtime::genScale( builder, loc, builder.createConvert(loc, f32Ty, result), minusI)); genRaiseExcept( _FORTRAN_RUNTIME_IEEE_UNDERFLOW | _FORTRAN_RUNTIME_IEEE_INEXACT, - builder.create(loc, mlir::arith::CmpFPredicate::ONE, - args[0], reverseResult)); + mlir::arith::CmpFOp::create(builder, loc, mlir::arith::CmpFPredicate::ONE, + args[0], reverseResult)); builder.setInsertionPointAfter(outerIfOp); return result; } @@ -7689,13 +7733,14 @@ IntrinsicLibrary::genScan(mlir::Type resultType, builder.getContext(), builder.getKindMap().defaultLogicalKind()); mlir::Value temp = builder.createTemporary(loc, logTy); mlir::Value castb = builder.createConvert(loc, logTy, b); - builder.create(loc, castb, temp); + fir::StoreOp::create(builder, loc, castb, temp); return builder.createBox(loc, temp); }; - mlir::Value back = fir::isUnboxedValue(args[2]) - ? makeRefThenEmbox(*args[2].getUnboxed()) - : builder.create( - loc, fir::BoxType::get(builder.getI1Type())); + mlir::Value back = + fir::isUnboxedValue(args[2]) + ? 
makeRefThenEmbox(*args[2].getUnboxed()) + : fir::AbsentOp::create(builder, loc, + fir::BoxType::get(builder.getI1Type())); // Handle required string argument mlir::Value string = builder.createBox(loc, args[0]); @@ -7736,7 +7781,7 @@ IntrinsicLibrary::genSecond(std::optional resultType, genCpuTime(subroutineArgs); if (resultType) - return builder.create(loc, fir::getBase(result)); + return fir::LoadOp::create(builder, loc, fir::getBase(result)); return {}; } @@ -7783,22 +7828,22 @@ IntrinsicLibrary::genSelectedRealKind(mlir::Type resultType, // Handle optional precision(P) argument mlir::Value precision = isStaticallyAbsent(args[0]) - ? builder.create( - loc, fir::ReferenceType::get(builder.getI1Type())) + ? fir::AbsentOp::create(builder, loc, + fir::ReferenceType::get(builder.getI1Type())) : fir::getBase(args[0]); // Handle optional range(R) argument mlir::Value range = isStaticallyAbsent(args[1]) - ? builder.create( - loc, fir::ReferenceType::get(builder.getI1Type())) + ? fir::AbsentOp::create(builder, loc, + fir::ReferenceType::get(builder.getI1Type())) : fir::getBase(args[1]); // Handle optional radix(RADIX) argument mlir::Value radix = isStaticallyAbsent(args[2]) - ? builder.create( - loc, fir::ReferenceType::get(builder.getI1Type())) + ? fir::AbsentOp::create(builder, loc, + fir::ReferenceType::get(builder.getI1Type())) : fir::getBase(args[2]); return builder.createConvert( @@ -7836,9 +7881,9 @@ createBoxForRuntimeBoundInquiry(mlir::Location loc, fir::FirOpBuilder &builder, // shape information. mlir::Value localShape = builder.createShape(loc, array); mlir::Value oldBox = boxValue.getAddr(); - return builder.create(loc, oldBox.getType(), oldBox, - localShape, - /*slice=*/mlir::Value{}); + return fir::ReboxOp::create(builder, loc, oldBox.getType(), oldBox, + localShape, + /*slice=*/mlir::Value{}); }, [&](const auto &) -> mlir::Value { // This is a pointer/allocatable, or an entity not yet tracked with a @@ -7884,7 +7929,7 @@ genBoundInquiry(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value resultBase = builder.createConvert(loc, baseType, resultStorage); mlir::Value rankValue = - builder.create(loc, builder.getIndexType(), arrayBox); + fir::BoxRankOp::create(builder, loc, builder.getIndexType(), arrayBox); return fir::ArrayBoxValue{resultBase, {rankValue}}; } // Result extent is a compile time constant in the other cases. 
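The bound-inquiry lowering above, together with the computeLBOUND and genUbound hunks further down, encodes the usual Fortran rule that a zero-extent dimension reports a lower bound of 1 and that UBOUND is LBOUND - 1 + extent. The following is a standalone scalar model of that rule (plain C++ with hypothetical names, not flang code):

#include <cassert>

// Standalone model of the bound rules the lowering encodes:
//  - a dimension with zero extent reports a lower bound of 1;
//  - the upper bound is then lower bound - 1 + extent.
// "declaredLb" and "extent" stand in for values read from the descriptor.
long lboundOf(long declaredLb, long extent) {
  // Assumed-size arrays carry extent == -1, which is why the real code tests
  // extent == 0 rather than extent > 0.
  return extent == 0 ? 1 : declaredLb;
}

long uboundOf(long declaredLb, long extent) {
  return lboundOf(declaredLb, extent) - 1 + extent;
}

int main() {
  assert(lboundOf(5, 0) == 1);   // empty dimension
  assert(lboundOf(5, 10) == 5);  // normal dimension
  assert(uboundOf(5, 10) == 14); // 5:14 holds 10 elements
  assert(uboundOf(5, 0) == 0);   // empty dimension: ubound < lbound
}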
@@ -7914,9 +7959,9 @@ IntrinsicLibrary::genShape(mlir::Type resultType, mlir::Value extent = fir::factory::readExtent(builder, loc, array, dim); extent = builder.createConvert(loc, extentType, extent); auto index = builder.createIntegerConstant(loc, indexType, dim); - auto shapeAddr = builder.create(loc, shapeAddrType, - shapeArray, index); - builder.create(loc, extent, shapeAddr); + auto shapeAddr = fir::CoordinateOp::create(builder, loc, shapeAddrType, + shapeArray, index); + fir::StoreOp::create(builder, loc, extent, shapeAddr); } mlir::Value shapeArrayExtent = builder.createIntegerConstant(loc, indexType, rank); @@ -7942,18 +7987,18 @@ mlir::Value IntrinsicLibrary::genShift(mlir::Type resultType, mlir::Value zero = builder.createIntegerConstant(loc, signlessType, 0); mlir::Value shift = builder.createConvert(loc, signlessType, args[1]); - mlir::Value tooSmall = builder.create( - loc, mlir::arith::CmpIPredicate::slt, shift, zero); - mlir::Value tooLarge = builder.create( - loc, mlir::arith::CmpIPredicate::sge, shift, bitSize); + mlir::Value tooSmall = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, shift, zero); + mlir::Value tooLarge = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sge, shift, bitSize); mlir::Value outOfBounds = - builder.create(loc, tooSmall, tooLarge); + mlir::arith::OrIOp::create(builder, loc, tooSmall, tooLarge); mlir::Value word = args[0]; if (word.getType().isUnsignedInteger()) word = builder.createConvert(loc, signlessType, word); - mlir::Value shifted = builder.create(loc, word, shift); + mlir::Value shifted = Shift::create(builder, loc, word, shift); mlir::Value result = - builder.create(loc, outOfBounds, zero, shifted); + mlir::arith::SelectOp::create(builder, loc, outOfBounds, zero, shifted); if (resultType.isUnsignedInteger()) return builder.createConvert(loc, resultType, result); return result; @@ -7968,8 +8013,8 @@ mlir::Value IntrinsicLibrary::genShiftA(mlir::Type resultType, mlir::IntegerType::SignednessSemantics::Signless); mlir::Value bitSize = builder.createIntegerConstant(loc, signlessType, bits); mlir::Value shift = builder.createConvert(loc, signlessType, args[1]); - mlir::Value shiftGeBitSize = builder.create( - loc, mlir::arith::CmpIPredicate::uge, shift, bitSize); + mlir::Value shiftGeBitSize = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::uge, shift, bitSize); // Lowering of mlir::arith::ShRSIOp is using `ashr`. `ashr` is undefined when // the shift amount is equal to the element size. 
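The genShift and genShiftA hunks here guard shift counts that are negative or not less than the bit width, since the underlying arith shift ops (for example `ashr`) are undefined for such counts. Below is a standalone scalar model of those guards for 32-bit operands (plain C++, not flang code):

#include <cassert>
#include <cstdint>

// SHIFTL/SHIFTR: a count that is negative or >= BIT_SIZE yields 0 instead of
// feeding an undefined count into the shift op.
int32_t shiftl32(int32_t word, int32_t shift) {
  if (shift < 0 || shift >= 32)
    return 0;
  return static_cast<int32_t>(static_cast<uint32_t>(word) << shift);
}

// SHIFTA: a count >= BIT_SIZE fills the result with the sign bit (-1 for
// negative values, 0 otherwise) rather than relying on `ashr`, which is
// undefined when the count equals the element size. (C++20 guarantees the
// arithmetic shift on the signed operand below.)
int32_t shifta32(int32_t word, uint32_t shift) {
  if (shift >= 32)
    return word < 0 ? -1 : 0;
  return word >> shift;
}

int main() {
  assert(shiftl32(1, 3) == 8);
  assert(shiftl32(1, 32) == 0);  // out-of-range count
  assert(shiftl32(1, -1) == 0);  // negative count
  assert(shifta32(-8, 2) == -2);
  assert(shifta32(-8, 32) == -1);
  assert(shifta32(8, 32) == 0);
}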
@@ -7981,13 +8026,13 @@ mlir::Value IntrinsicLibrary::genShiftA(mlir::Type resultType, mlir::Value word = args[0]; if (word.getType().isUnsignedInteger()) word = builder.createConvert(loc, signlessType, word); - mlir::Value valueIsNeg = builder.create( - loc, mlir::arith::CmpIPredicate::slt, word, zero); + mlir::Value valueIsNeg = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, word, zero); mlir::Value specialRes = - builder.create(loc, valueIsNeg, minusOne, zero); - mlir::Value shifted = builder.create(loc, word, shift); - mlir::Value result = builder.create( - loc, shiftGeBitSize, specialRes, shifted); + mlir::arith::SelectOp::create(builder, loc, valueIsNeg, minusOne, zero); + mlir::Value shifted = mlir::arith::ShRSIOp::create(builder, loc, word, shift); + mlir::Value result = mlir::arith::SelectOp::create( + builder, loc, shiftGeBitSize, specialRes, shifted); if (resultType.isUnsignedInteger()) return builder.createConvert(loc, resultType, result); return result; @@ -8012,10 +8057,10 @@ mlir::Value IntrinsicLibrary::genSign(mlir::Type resultType, if (mlir::isa(resultType)) { mlir::Value abs = genAbs(resultType, {args[0]}); mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); - auto neg = builder.create(loc, zero, abs); - auto cmp = builder.create( - loc, mlir::arith::CmpIPredicate::slt, args[1], zero); - return builder.create(loc, cmp, neg, abs); + auto neg = mlir::arith::SubIOp::create(builder, loc, zero, abs); + auto cmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, args[1], zero); + return mlir::arith::SelectOp::create(builder, loc, cmp, neg, abs); } return genRuntimeCall("sign", resultType, args); } @@ -8031,6 +8076,21 @@ mlir::Value IntrinsicLibrary::genSind(mlir::Type resultType, mlir::Value dfactor = builder.createRealConstant( loc, mlir::Float64Type::get(context), pi / llvm::APFloat(180.0)); mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); + mlir::Value arg = mlir::arith::MulFOp::create(builder, loc, args[0], factor); + return getRuntimeCallGenerator("sin", ftype)(builder, loc, {arg}); +} + +// SINPI +mlir::Value IntrinsicLibrary::genSinpi(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 1); + mlir::MLIRContext *context = builder.getContext(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); + llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); + mlir::Value dfactor = + builder.createRealConstant(loc, mlir::Float64Type::get(context), pi); + mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); mlir::Value arg = builder.create(loc, args[0], factor); return getRuntimeCallGenerator("sin", ftype)(builder, loc, {arg}); } @@ -8075,14 +8135,14 @@ IntrinsicLibrary::genSize(mlir::Type resultType, .genThen([&]() { mlir::Value size = builder.createConvert( loc, resultType, fir::runtime::genSize(builder, loc, array)); - builder.create(loc, size); + fir::ResultOp::create(builder, loc, size); }) .genElse([&]() { - mlir::Value dimValue = builder.create(loc, dim); + mlir::Value dimValue = fir::LoadOp::create(builder, loc, dim); mlir::Value size = builder.createConvert( loc, resultType, fir::runtime::genSizeDim(builder, loc, array, dimValue)); - builder.create(loc, size); + fir::ResultOp::create(builder, loc, size); }) .getResults()[0]; } @@ -8093,12 +8153,13 @@ IntrinsicLibrary::genSizeOf(mlir::Type resultType, llvm::ArrayRef args) { assert(args.size() == 1); mlir::Value box = 
fir::getBase(args[0]); - mlir::Value eleSize = builder.create(loc, resultType, box); + mlir::Value eleSize = + fir::BoxEleSizeOp::create(builder, loc, resultType, box); if (!fir::isArray(args[0])) return eleSize; mlir::Value arraySize = builder.createConvert( loc, resultType, fir::runtime::genSize(builder, loc, box)); - return builder.create(loc, eleSize, arraySize); + return mlir::arith::MulIOp::create(builder, loc, eleSize, arraySize); } // TAND @@ -8112,7 +8173,7 @@ mlir::Value IntrinsicLibrary::genTand(mlir::Type resultType, mlir::Value dfactor = builder.createRealConstant( loc, mlir::Float64Type::get(context), pi / llvm::APFloat(180.0)); mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); - mlir::Value arg = builder.create(loc, args[0], factor); + mlir::Value arg = mlir::arith::MulFOp::create(builder, loc, args[0], factor); return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg}); } @@ -8122,81 +8183,83 @@ mlir::Value IntrinsicLibrary::genThisGrid(mlir::Type resultType, assert(args.size() == 0); auto recTy = mlir::cast(resultType); assert(recTy && "RecordType expepected"); - mlir::Value res = builder.create(loc, resultType); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); mlir::Type i32Ty = builder.getI32Type(); - mlir::Value threadIdX = builder.create(loc, i32Ty); - mlir::Value threadIdY = builder.create(loc, i32Ty); - mlir::Value threadIdZ = builder.create(loc, i32Ty); + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - mlir::Value blockIdX = builder.create(loc, i32Ty); - mlir::Value blockIdY = builder.create(loc, i32Ty); - mlir::Value blockIdZ = builder.create(loc, i32Ty); + mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); + mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); + mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); - mlir::Value blockDimX = builder.create(loc, i32Ty); - mlir::Value blockDimY = builder.create(loc, i32Ty); - mlir::Value blockDimZ = builder.create(loc, i32Ty); - mlir::Value gridDimX = builder.create(loc, i32Ty); - mlir::Value gridDimY = builder.create(loc, i32Ty); - mlir::Value gridDimZ = builder.create(loc, i32Ty); + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); + mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); + mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * // (blockDim.x * gridDim.x); mlir::Value resZ = - builder.create(loc, blockDimZ, gridDimZ); + mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); mlir::Value resY = - builder.create(loc, blockDimY, gridDimY); + mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); mlir::Value resX = - builder.create(loc, blockDimX, gridDimX); - mlir::Value resZY = builder.create(loc, resZ, resY); - mlir::Value size = builder.create(loc, resZY, resX); + mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); + mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, 
resY); + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + // blockIdx.x; // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + // ((threadIdx.z * blockDim.y) * blockDim.x) + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value r1 = builder.create(loc, blockIdZ, gridDimY); - mlir::Value r2 = builder.create(loc, r1, gridDimX); - mlir::Value r3 = builder.create(loc, blockIdY, gridDimX); - mlir::Value r2r3 = builder.create(loc, r2, r3); - mlir::Value tmp = builder.create(loc, r2r3, blockIdX); + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); mlir::Value bXbY = - builder.create(loc, blockDimX, blockDimY); + mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); mlir::Value bXbYbZ = - builder.create(loc, bXbY, blockDimZ); + mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); mlir::Value tZbY = - builder.create(loc, threadIdZ, blockDimY); + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); mlir::Value tZbYbX = - builder.create(loc, tZbY, blockDimX); + mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); mlir::Value tYbX = - builder.create(loc, threadIdY, blockDimX); - mlir::Value rank = builder.create(loc, tmp, bXbYbZ); - rank = builder.create(loc, rank, tZbYbX); - rank = builder.create(loc, rank, tYbX); - rank = builder.create(loc, rank, threadIdX); + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = builder.create(loc, rank, one); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); auto sizeFieldName = recTy.getTypeList()[1].first; mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = builder.create( - loc, fieldIndexType, sizeFieldName, recTy, + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = builder.create( - loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - builder.create(loc, size, sizeCoord); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); auto rankFieldName = recTy.getTypeList()[2].first; mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = builder.create( - loc, fieldIndexType, rankFieldName, recTy, + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = builder.create( - loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - builder.create(loc, rank, rankCoord); + mlir::Value rankCoord = 
fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); return res; } @@ -8207,50 +8270,50 @@ IntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, assert(args.size() == 0); auto recTy = mlir::cast(resultType); assert(recTy && "RecordType expepected"); - mlir::Value res = builder.create(loc, resultType); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); mlir::Type i32Ty = builder.getI32Type(); // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; - mlir::Value blockDimX = builder.create(loc, i32Ty); - mlir::Value blockDimY = builder.create(loc, i32Ty); - mlir::Value blockDimZ = builder.create(loc, i32Ty); + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); mlir::Value size = - builder.create(loc, blockDimZ, blockDimY); - size = builder.create(loc, size, blockDimX); + mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); + size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value threadIdX = builder.create(loc, i32Ty); - mlir::Value threadIdY = builder.create(loc, i32Ty); - mlir::Value threadIdZ = builder.create(loc, i32Ty); + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); mlir::Value r1 = - builder.create(loc, threadIdZ, blockDimY); - mlir::Value r2 = builder.create(loc, r1, blockDimX); + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); mlir::Value r3 = - builder.create(loc, threadIdY, blockDimX); - mlir::Value r2r3 = builder.create(loc, r2, r3); - mlir::Value rank = builder.create(loc, r2r3, threadIdX); + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = builder.create(loc, rank, one); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); auto sizeFieldName = recTy.getTypeList()[1].first; mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = builder.create( - loc, fieldIndexType, sizeFieldName, recTy, + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = builder.create( - loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - builder.create(loc, size, sizeCoord); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); auto rankFieldName = recTy.getTypeList()[2].first; mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = builder.create( - loc, fieldIndexType, rankFieldName, recTy, + mlir::Value rankFieldIndex = 
fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = builder.create( - loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - builder.create(loc, rank, rankCoord); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); return res; } @@ -8260,7 +8323,7 @@ mlir::Value IntrinsicLibrary::genThisWarp(mlir::Type resultType, assert(args.size() == 0); auto recTy = mlir::cast(resultType); assert(recTy && "RecordType expepected"); - mlir::Value res = builder.create(loc, resultType); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); mlir::Type i32Ty = builder.getI32Type(); // coalesced_group%size = 32 @@ -8268,28 +8331,28 @@ mlir::Value IntrinsicLibrary::genThisWarp(mlir::Type resultType, auto sizeFieldName = recTy.getTypeList()[1].first; mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = builder.create( - loc, fieldIndexType, sizeFieldName, recTy, + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = builder.create( - loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - builder.create(loc, size, sizeCoord); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); // coalesced_group%rank = threadIdx.x & 31 + 1 - mlir::Value threadIdX = builder.create(loc, i32Ty); + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); mlir::Value masked = - builder.create(loc, threadIdX, mask); - mlir::Value rank = builder.create(loc, masked, one); + mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); auto rankFieldName = recTy.getTypeList()[2].first; mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = builder.create( - loc, fieldIndexType, rankFieldName, recTy, + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = builder.create( - loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - builder.create(loc, rank, rankCoord); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); return res; } @@ -8299,7 +8362,7 @@ mlir::Value IntrinsicLibrary::genTrailz(mlir::Type resultType, assert(args.size() == 1); mlir::Value result = - builder.create(loc, args); + mlir::math::CountTrailingZerosOp::create(builder, loc, args); return builder.createConvert(loc, resultType, result); } @@ -8327,10 +8390,10 @@ static mlir::Value computeLBOUND(fir::FirOpBuilder &builder, mlir::Location loc, zero = builder.createConvert(loc, extent.getType(), zero); // Note: for assumed size, the extent is -1, and the lower bound should // be returned. It is important to test extent == 0 and not extent > 0. 
- auto dimIsEmpty = builder.create( - loc, mlir::arith::CmpIPredicate::eq, extent, zero); + auto dimIsEmpty = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, extent, zero); one = builder.createConvert(loc, lb.getType(), one); - return builder.create(loc, dimIsEmpty, one, lb); + return mlir::arith::SelectOp::create(builder, loc, dimIsEmpty, one, lb); } // LBOUND @@ -8366,8 +8429,8 @@ IntrinsicLibrary::genLbound(mlir::Type resultType, lb = builder.createConvert(loc, lbType, lb); auto index = builder.createIntegerConstant(loc, indexType, dim); auto lbAddr = - builder.create(loc, lbAddrType, lbArray, index); - builder.create(loc, lb, lbAddr); + fir::CoordinateOp::create(builder, loc, lbAddrType, lbArray, index); + fir::StoreOp::create(builder, loc, lb, lbAddr); } mlir::Value lbArrayExtent = builder.createIntegerConstant(loc, indexType, rank); @@ -8406,8 +8469,8 @@ IntrinsicLibrary::genUbound(mlir::Type resultType, mlir::Value lbound = fir::getBase(genLbound(resultType, args)); mlir::Value one = builder.createIntegerConstant(loc, resultType, 1); - mlir::Value ubound = builder.create(loc, lbound, one); - return builder.create(loc, ubound, extent); + mlir::Value ubound = mlir::arith::SubIOp::create(builder, loc, lbound, one); + return mlir::arith::AddIOp::create(builder, loc, ubound, extent); } // Handle calls to UBOUND without the DIM argument, which return an array int kindPos = args.size() == 2 ? 1 : 2; @@ -8501,9 +8564,9 @@ IntrinsicLibrary::genStorageSize(mlir::Type resultType, box = builder.createBox(loc, args[0], /*isPolymorphic=*/args[0].isPolymorphic()); - mlir::Value eleSize = builder.create(loc, kindTy, box); + mlir::Value eleSize = fir::BoxEleSizeOp::create(builder, loc, kindTy, box); mlir::Value c8 = builder.createIntegerConstant(loc, kindTy, 8); - return builder.create(loc, eleSize, c8); + return mlir::arith::MulIOp::create(builder, loc, eleSize, c8); } // SUM @@ -8516,7 +8579,7 @@ IntrinsicLibrary::genSum(mlir::Type resultType, // SYNCTHREADS void IntrinsicLibrary::genSyncThreads(llvm::ArrayRef args) { - builder.create(loc); + mlir::NVVM::Barrier0Op::create(builder, loc); } // SYNCTHREADS_AND @@ -8528,7 +8591,7 @@ IntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, mlir::FunctionType ftype = mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); auto funcOp = builder.createFunction(loc, funcName, ftype); - return builder.create(loc, funcOp, args).getResult(0); + return fir::CallOp::create(builder, loc, funcOp, args).getResult(0); } // SYNCTHREADS_COUNT @@ -8540,7 +8603,7 @@ IntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, mlir::FunctionType ftype = mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); auto funcOp = builder.createFunction(loc, funcName, ftype); - return builder.create(loc, funcOp, args).getResult(0); + return fir::CallOp::create(builder, loc, funcOp, args).getResult(0); } // SYNCTHREADS_OR @@ -8552,7 +8615,7 @@ IntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, mlir::FunctionType ftype = mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); auto funcOp = builder.createFunction(loc, funcName, ftype); - return builder.create(loc, funcOp, args).getResult(0); + return fir::CallOp::create(builder, loc, funcOp, args).getResult(0); } // SYNCWARP @@ -8564,7 +8627,7 @@ void IntrinsicLibrary::genSyncWarp(llvm::ArrayRef args) { mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); auto funcOp = builder.createFunction(loc, funcName, funcType); llvm::SmallVector 
argsList{mask}; - builder.create(loc, funcOp, argsList); + fir::CallOp::create(builder, loc, funcOp, argsList); } // SYSTEM @@ -8590,25 +8653,26 @@ IntrinsicLibrary::genSystem(std::optional resultType, mlir::Value exitstatBox = isStaticallyPresent(exitstat) ? fir::getBase(exitstat) - : builder.create(loc, boxNoneTy).getResult(); + : fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); // Create a dummmy cmdstat to prevent EXECUTE_COMMAND_LINE terminate itself // when cmdstat is assigned with a non-zero value but not present mlir::Value tempValue = builder.createIntegerConstant(loc, builder.getI16Type(), 0); mlir::Value temp = builder.createTemporary(loc, builder.getI16Type()); - builder.create(loc, tempValue, temp); + fir::StoreOp::create(builder, loc, tempValue, temp); mlir::Value cmdstatBox = builder.createBox(loc, temp); mlir::Value cmdmsgBox = - builder.create(loc, boxNoneTy).getResult(); + fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); fir::runtime::genExecuteCommandLine(builder, loc, command, waitBool, exitstatBox, cmdstatBox, cmdmsgBox); if (resultType) { - mlir::Value exitstatAddr = builder.create(loc, exitstatBox); - return builder.create(loc, fir::getBase(exitstatAddr)); + mlir::Value exitstatAddr = + fir::BoxAddrOp::create(builder, loc, exitstatBox); + return fir::LoadOp::create(builder, loc, fir::getBase(exitstatAddr)); } return {}; } @@ -8704,7 +8768,7 @@ void IntrinsicLibrary::genThreadFence(llvm::ArrayRef args) { mlir::FunctionType::get(builder.getContext(), {}, {}); auto funcOp = builder.createFunction(loc, funcName, funcType); llvm::SmallVector noArgs; - builder.create(loc, funcOp, noArgs); + fir::CallOp::create(builder, loc, funcOp, noArgs); } // THREADFENCE_BLOCK @@ -8715,7 +8779,7 @@ void IntrinsicLibrary::genThreadFenceBlock( mlir::FunctionType::get(builder.getContext(), {}, {}); auto funcOp = builder.createFunction(loc, funcName, funcType); llvm::SmallVector noArgs; - builder.create(loc, funcOp, noArgs); + fir::CallOp::create(builder, loc, funcOp, noArgs); } // THREADFENCE_SYSTEM @@ -8726,7 +8790,7 @@ void IntrinsicLibrary::genThreadFenceSystem( mlir::FunctionType::get(builder.getContext(), {}, {}); auto funcOp = builder.createFunction(loc, funcName, funcType); llvm::SmallVector noArgs; - builder.create(loc, funcOp, noArgs); + fir::CallOp::create(builder, loc, funcOp, noArgs); } // TIME @@ -8778,29 +8842,30 @@ static mlir::Value createExtremumCompare(mlir::Location loc, // Return the number if one of the inputs is NaN and the other is // a number. 
auto leftIsResult = - builder.create(loc, orderedCmp, left, right); - auto rightIsNan = builder.create( - loc, mlir::arith::CmpFPredicate::UNE, right, right); + mlir::arith::CmpFOp::create(builder, loc, orderedCmp, left, right); + auto rightIsNan = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::UNE, right, right); result = - builder.create(loc, leftIsResult, rightIsNan); + mlir::arith::OrIOp::create(builder, loc, leftIsResult, rightIsNan); } else if constexpr (behavior == ExtremumBehavior::IeeeMinMaximum) { // Always return NaNs if one the input is NaNs auto leftIsResult = - builder.create(loc, orderedCmp, left, right); - auto leftIsNan = builder.create( - loc, mlir::arith::CmpFPredicate::UNE, left, left); - result = builder.create(loc, leftIsResult, leftIsNan); + mlir::arith::CmpFOp::create(builder, loc, orderedCmp, left, right); + auto leftIsNan = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::UNE, left, left); + result = + mlir::arith::OrIOp::create(builder, loc, leftIsResult, leftIsNan); } else if constexpr (behavior == ExtremumBehavior::MinMaxss) { // If the left is a NaN, return the right whatever it is. result = - builder.create(loc, orderedCmp, left, right); + mlir::arith::CmpFOp::create(builder, loc, orderedCmp, left, right); } else if constexpr (behavior == ExtremumBehavior::PgfortranLlvm) { // If one of the operand is a NaN, return left whatever it is. static constexpr auto unorderedCmp = extremum == Extremum::Max ? mlir::arith::CmpFPredicate::UGT : mlir::arith::CmpFPredicate::ULT; result = - builder.create(loc, unorderedCmp, left, right); + mlir::arith::CmpFOp::create(builder, loc, unorderedCmp, left, right); } else { // TODO: ieeeMinNum/ieeeMaxNum static_assert(behavior == ExtremumBehavior::IeeeMinMaxNum, @@ -8814,8 +8879,8 @@ static mlir::Value createExtremumCompare(mlir::Location loc, left = builder.createConvert(loc, signlessType, left); right = builder.createConvert(loc, signlessType, right); } - result = - builder.create(loc, integerPredicate, left, right); + result = mlir::arith::CmpIOp::create(builder, loc, integerPredicate, left, + right); } else if (fir::isa_char(type) || fir::isa_char(fir::unwrapRefType(type))) { // TODO: ! character min and max is tricky because the result // length is the length of the longest argument! @@ -8936,13 +9001,14 @@ IntrinsicLibrary::genVerify(mlir::Type resultType, builder.getContext(), builder.getKindMap().defaultLogicalKind()); mlir::Value temp = builder.createTemporary(loc, logTy); mlir::Value castb = builder.createConvert(loc, logTy, b); - builder.create(loc, castb, temp); + fir::StoreOp::create(builder, loc, castb, temp); return builder.createBox(loc, temp); }; - mlir::Value back = fir::isUnboxedValue(args[2]) - ? makeRefThenEmbox(*args[2].getUnboxed()) - : builder.create( - loc, fir::BoxType::get(builder.getI1Type())); + mlir::Value back = + fir::isUnboxedValue(args[2]) + ? makeRefThenEmbox(*args[2].getUnboxed()) + : fir::AbsentOp::create(builder, loc, + fir::BoxType::get(builder.getI1Type())); // Handle required string argument mlir::Value string = builder.createBox(loc, args[0]); @@ -8982,8 +9048,8 @@ IntrinsicLibrary::genExtremumloc(FN func, FD funcDim, llvm::StringRef errMsg, // Handle optional mask argument auto mask = isStaticallyAbsent(args[2]) - ? builder.create( - loc, fir::BoxType::get(builder.getI1Type())) + ? 
fir::AbsentOp::create( + builder, loc, fir::BoxType::get(builder.getI1Type())) : builder.createBox(loc, args[2]); // Handle optional kind argument @@ -9065,8 +9131,8 @@ IntrinsicLibrary::genExtremumVal(FN func, FD funcDim, FC funcChar, // Handle optional mask argument auto mask = isStaticallyAbsent(args[2]) - ? builder.create( - loc, fir::BoxType::get(builder.getI1Type())) + ? fir::AbsentOp::create( + builder, loc, fir::BoxType::get(builder.getI1Type())) : builder.createBox(loc, args[2]); bool absentDim = isStaticallyAbsent(args[1]); @@ -9130,7 +9196,7 @@ mlir::Value IntrinsicLibrary::genExtremum(mlir::Type, for (auto arg : args.drop_front()) { mlir::Value mask = createExtremumCompare(loc, builder, result, arg); - result = builder.create(loc, mask, result, arg); + result = mlir::arith::SelectOp::create(builder, loc, mask, result, arg); } return result; } diff --git a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp index 64d70d70829fb..3fb7fab099965 100644 --- a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp +++ b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp @@ -31,8 +31,7 @@ mlir::func::FuncOp fir::factory::getRealloc(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getLlvmGetRounding(fir::FirOpBuilder &builder) { auto int32Ty = builder.getIntegerType(32); - auto funcTy = - mlir::FunctionType::get(builder.getContext(), std::nullopt, {int32Ty}); + auto funcTy = mlir::FunctionType::get(builder.getContext(), {}, {int32Ty}); return builder.createFunction(builder.getUnknownLoc(), "llvm.get.rounding", funcTy); } @@ -40,8 +39,7 @@ fir::factory::getLlvmGetRounding(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getLlvmSetRounding(fir::FirOpBuilder &builder) { auto int32Ty = builder.getIntegerType(32); - auto funcTy = - mlir::FunctionType::get(builder.getContext(), {int32Ty}, std::nullopt); + auto funcTy = mlir::FunctionType::get(builder.getContext(), {int32Ty}, {}); return builder.createFunction(builder.getUnknownLoc(), "llvm.set.rounding", funcTy); } @@ -49,8 +47,8 @@ fir::factory::getLlvmSetRounding(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getLlvmInitTrampoline(fir::FirOpBuilder &builder) { auto ptrTy = builder.getRefType(builder.getIntegerType(8)); - auto funcTy = mlir::FunctionType::get(builder.getContext(), - {ptrTy, ptrTy, ptrTy}, std::nullopt); + auto funcTy = + mlir::FunctionType::get(builder.getContext(), {ptrTy, ptrTy, ptrTy}, {}); return builder.createFunction(builder.getUnknownLoc(), "llvm.init.trampoline", funcTy); } @@ -90,8 +88,7 @@ mlir::func::FuncOp fir::factory::getFeenableexcept(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getFegetexcept(fir::FirOpBuilder &builder) { auto int32Ty = builder.getIntegerType(32); - auto funcTy = - mlir::FunctionType::get(builder.getContext(), std::nullopt, {int32Ty}); + auto funcTy = mlir::FunctionType::get(builder.getContext(), {}, {int32Ty}); return builder.createFunction(builder.getUnknownLoc(), "fegetexcept", funcTy); } diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp index d944a4c98473e..50c945df5b465 100644 --- a/flang/lib/Optimizer/Builder/MutableBox.cpp +++ b/flang/lib/Optimizer/Builder/MutableBox.cpp @@ -35,7 +35,7 @@ createNewFirBox(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value shape; if (!extents.empty()) { if (lbounds.empty()) { - shape = builder.create(loc, extents); + shape = fir::ShapeOp::create(builder, loc, extents); } else { 
llvm::SmallVector shapeShiftBounds; for (auto [lb, extent] : llvm::zip(lbounds, extents)) { @@ -44,8 +44,8 @@ createNewFirBox(fir::FirOpBuilder &builder, mlir::Location loc, } auto shapeShiftType = fir::ShapeShiftType::get(builder.getContext(), extents.size()); - shape = builder.create(loc, shapeShiftType, - shapeShiftBounds); + shape = fir::ShapeShiftOp::create(builder, loc, shapeShiftType, + shapeShiftBounds); } } // Otherwise, this a scalar. Leave the shape empty. @@ -78,8 +78,8 @@ createNewFirBox(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value emptySlice; auto boxType = fir::updateTypeWithVolatility( box.getBoxTy(), fir::isa_volatile_type(cleanedAddr.getType())); - return builder.create(loc, boxType, cleanedAddr, shape, - emptySlice, cleanedLengths, tdesc); + return fir::EmboxOp::create(builder, loc, boxType, cleanedAddr, shape, + emptySlice, cleanedLengths, tdesc); } //===----------------------------------------------------------------------===// @@ -106,7 +106,7 @@ class MutablePropertyReader { bool forceIRBoxRead = false) : builder{builder}, loc{loc}, box{box} { if (forceIRBoxRead || !box.isDescribedByVariables()) - irBox = builder.create(loc, box.getAddr()); + irBox = fir::LoadOp::create(builder, loc, box.getAddr()); } /// Get base address of allocated/associated entity. mlir::Value readBaseAddress() { @@ -114,10 +114,10 @@ class MutablePropertyReader { auto memrefTy = box.getBoxTy().getEleTy(); if (!fir::isa_ref_type(memrefTy)) memrefTy = builder.getRefType(memrefTy); - return builder.create(loc, memrefTy, irBox); + return fir::BoxAddrOp::create(builder, loc, memrefTy, irBox); } auto addrVar = box.getMutableProperties().addr; - return builder.create(loc, addrVar); + return fir::LoadOp::create(builder, loc, addrVar); } /// Return {lbound, extent} values read from the MutableBoxValue given /// the dimension. @@ -125,13 +125,14 @@ class MutablePropertyReader { auto idxTy = builder.getIndexType(); if (irBox) { auto dimVal = builder.createIntegerConstant(loc, idxTy, dim); - auto dimInfo = builder.create(loc, idxTy, idxTy, idxTy, - irBox, dimVal); + auto dimInfo = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, + irBox, dimVal); return {dimInfo.getResult(0), dimInfo.getResult(1)}; } const auto &mutableProperties = box.getMutableProperties(); - auto lb = builder.create(loc, mutableProperties.lbounds[dim]); - auto ext = builder.create(loc, mutableProperties.extents[dim]); + auto lb = fir::LoadOp::create(builder, loc, mutableProperties.lbounds[dim]); + auto ext = + fir::LoadOp::create(builder, loc, mutableProperties.extents[dim]); return {lb, ext}; } @@ -146,7 +147,7 @@ class MutablePropertyReader { const auto &deferred = box.getMutableProperties().deferredParams; if (deferred.empty()) fir::emitFatalError(loc, "allocatable entity has no length property"); - return builder.create(loc, deferred[0]); + return fir::LoadOp::create(builder, loc, deferred[0]); } /// Read and return all extents. If \p lbounds vector is provided, lbounds are @@ -223,7 +224,7 @@ class MutablePropertyWriter { /// all that can be described in the new fir.box (e.g. non contiguous entity). void updateWithIrBox(mlir::Value newBox) { assert(!box.isDescribedByVariables()); - builder.create(loc, newBox, box.getAddr()); + fir::StoreOp::create(builder, loc, newBox, box.getAddr()); } /// Set unallocated/disassociated status for the entity described by /// MutableBoxValue. Deallocation is not performed by this helper. 
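MutablePropertyReader and MutablePropertyWriter in this file hide whether a POINTER/ALLOCATABLE is currently tracked as an in-memory fir.box descriptor or as a set of separate local property variables. A rough standalone analogy of that two-representation design, using made-up types rather than the FIR builder API:

#include <cassert>
#include <utility>
#include <variant>
#include <vector>

// Made-up stand-ins: a descriptor that must be queried, or separately stored
// property variables (address, bounds, extents).
struct Descriptor {
  void *base;
  std::vector<std::pair<long, long>> dims; // {lower bound, extent} per dim
};
struct Properties {
  void *addr;
  std::vector<long> lbounds;
  std::vector<long> extents;
};

using MutableBoxRep = std::variant<Descriptor, Properties>;

// Analogue of MutablePropertyReader::readLowerBoundAndExtent: callers do not
// care which representation is in use.
std::pair<long, long> readLowerBoundAndExtent(const MutableBoxRep &box,
                                              unsigned dim) {
  if (const auto *desc = std::get_if<Descriptor>(&box))
    return desc->dims[dim];                      // one query on the descriptor
  const auto &props = std::get<Properties>(box); // otherwise two scalar loads
  return {props.lbounds[dim], props.extents[dim]};
}

int main() {
  MutableBoxRep a = Descriptor{nullptr, {{1, 10}}};
  MutableBoxRep b = Properties{nullptr, {1}, {10}};
  assert(readLowerBoundAndExtent(a, 0) == readLowerBoundAndExtent(b, 0));
}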
@@ -231,8 +232,8 @@ class MutablePropertyWriter { if (box.isDescribedByVariables()) { auto addrVar = box.getMutableProperties().addr; auto nullTy = fir::dyn_cast_ptrEleTy(addrVar.getType()); - builder.create(loc, builder.createNullConstant(loc, nullTy), - addrVar); + fir::StoreOp::create(builder, loc, + builder.createNullConstant(loc, nullTy), addrVar); } else { // Note that the dynamic type of polymorphic entities must be reset to the // declaration type of the mutable box. See Fortran 2018 7.8.2 NOTE 1. @@ -246,7 +247,7 @@ class MutablePropertyWriter { auto deallocatedBox = fir::factory::createUnallocatedBox( builder, loc, box.getBoxTy(), box.nonDeferredLenParams(), typeSourceBox, allocator); - builder.create(loc, deallocatedBox, box.getAddr()); + fir::StoreOp::create(builder, loc, deallocatedBox, box.getAddr()); } } @@ -286,7 +287,7 @@ class MutablePropertyWriter { const bool valueTypeIsVolatile = fir::isa_volatile_type(fir::unwrapRefType(box.getAddr().getType())); irBox = builder.createVolatileCast(loc, valueTypeIsVolatile, irBox); - builder.create(loc, irBox, box.getAddr()); + fir::StoreOp::create(builder, loc, irBox, box.getAddr()); } /// Update the set of property variables of the MutableBoxValue. @@ -295,8 +296,8 @@ class MutablePropertyWriter { mlir::ValueRange lengths) { auto castAndStore = [&](mlir::Value val, mlir::Value addr) { auto type = fir::dyn_cast_ptrEleTy(addr.getType()); - builder.create(loc, builder.createConvert(loc, type, val), - addr); + fir::StoreOp::create(builder, loc, builder.createConvert(loc, type, val), + addr); }; const auto &mutableProperties = box.getMutableProperties(); castAndStore(addr, mutableProperties.addr); @@ -379,8 +380,8 @@ mlir::Value fir::factory::createUnallocatedBox( } } mlir::Value emptySlice; - auto embox = builder.create( - loc, baseBoxType, nullAddr, shape, emptySlice, lenParams, typeSourceBox); + auto embox = fir::EmboxOp::create(builder, loc, baseBoxType, nullAddr, shape, + emptySlice, lenParams, typeSourceBox); if (allocator != 0) embox.setAllocatorIdx(allocator); if (isAssumedRank) @@ -459,7 +460,7 @@ fir::factory::genMutableBoxRead(fir::FirOpBuilder &builder, mlir::Location loc, } mlir::Value sourceBox; if (box.isPolymorphic()) - sourceBox = builder.create(loc, box.getAddr()); + sourceBox = fir::LoadOp::create(builder, loc, box.getAddr()); if (rank) return fir::ArrayBoxValue{addr, extents, lbounds, sourceBox}; if (box.isPolymorphic()) @@ -490,7 +491,7 @@ static void genFreemem(fir::FirOpBuilder &builder, mlir::Location loc, // so make sure the heap type is restored before deallocation. 
auto cast = builder.createConvert( loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr); - builder.create(loc, cast); + fir::FreeMemOp::create(builder, loc, cast); } void fir::factory::genFreememIfAllocated(fir::FirOpBuilder &builder, @@ -498,8 +499,8 @@ void fir::factory::genFreememIfAllocated(fir::FirOpBuilder &builder, const fir::MutableBoxValue &box) { auto addr = MutablePropertyReader(builder, loc, box).readBaseAddress(); auto isAllocated = builder.genIsNotNullAddr(loc, addr); - auto ifOp = builder.create(loc, isAllocated, - /*withElseRegion=*/false); + auto ifOp = fir::IfOp::create(builder, loc, isAllocated, + /*withElseRegion=*/false); auto insPt = builder.saveInsertionPoint(); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); ::genFreemem(builder, loc, addr); @@ -521,23 +522,23 @@ void fir::factory::associateMutableBox(fir::FirOpBuilder &builder, mlir::Value sourceBox; if (auto *polyBox = source.getBoxOf()) sourceBox = polyBox->getSourceBox(); - writer.updateMutableBox(p.getAddr(), /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, - /*lengths=*/std::nullopt, sourceBox); + writer.updateMutableBox(p.getAddr(), /*lbounds=*/{}, + /*extents=*/{}, + /*lengths=*/{}, sourceBox); }, [&](const fir::UnboxedValue &addr) { - writer.updateMutableBox(addr, /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, - /*lengths=*/std::nullopt); + writer.updateMutableBox(addr, /*lbounds=*/{}, + /*extents=*/{}, + /*lengths=*/{}); }, [&](const fir::CharBoxValue &ch) { - writer.updateMutableBox(ch.getAddr(), /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, {ch.getLen()}); + writer.updateMutableBox(ch.getAddr(), /*lbounds=*/{}, + /*extents=*/{}, {ch.getLen()}); }, [&](const fir::ArrayBoxValue &arr) { writer.updateMutableBox(arr.getAddr(), lbounds.empty() ? arr.getLBounds() : lbounds, - arr.getExtents(), /*lengths=*/std::nullopt); + arr.getExtents(), /*lengths=*/{}); }, [&](const fir::CharArrayBoxValue &arr) { writer.updateMutableBox(arr.getAddr(), @@ -553,15 +554,15 @@ void fir::factory::associateMutableBox(fir::FirOpBuilder &builder, if (box.hasAssumedRank()) { assert(arr.hasAssumedRank() && "expect both arr and box to be assumed-rank"); - mlir::Value reboxed = builder.create( - loc, box.getBoxTy(), arr.getAddr(), + mlir::Value reboxed = fir::ReboxAssumedRankOp::create( + builder, loc, box.getBoxTy(), arr.getAddr(), fir::LowerBoundModifierAttribute::Preserve); writer.updateWithIrBox(reboxed); } else if (box.isDescribedByVariables()) { // LHS is a contiguous pointer described by local variables. Open RHS // fir.box to update the LHS. 
- auto rawAddr = builder.create(loc, arr.getMemTy(), - arr.getAddr()); + auto rawAddr = fir::BoxAddrOp::create(builder, loc, arr.getMemTy(), + arr.getAddr()); auto extents = fir::factory::getExtents(loc, builder, source); llvm::SmallVector lenParams; if (arr.isCharacter()) { @@ -576,11 +577,11 @@ void fir::factory::associateMutableBox(fir::FirOpBuilder &builder, if (!newLbounds.empty()) { auto shiftType = fir::ShiftType::get(builder.getContext(), newLbounds.size()); - shift = builder.create(loc, shiftType, newLbounds); + shift = fir::ShiftOp::create(builder, loc, shiftType, newLbounds); } auto reboxed = - builder.create(loc, box.getBoxTy(), arr.getAddr(), - shift, /*slice=*/mlir::Value()); + fir::ReboxOp::create(builder, loc, box.getBoxTy(), arr.getAddr(), + shift, /*slice=*/mlir::Value()); writer.updateWithIrBox(reboxed); } }, @@ -608,9 +609,9 @@ void fir::factory::associateMutableBoxWithRemap( for (auto [lb, ub] : llvm::zip(lbounds, ubounds)) { auto lbi = builder.createConvert(loc, idxTy, lb); auto ubi = builder.createConvert(loc, idxTy, ub); - auto diff = builder.create(loc, idxTy, ubi, lbi); + auto diff = mlir::arith::SubIOp::create(builder, loc, idxTy, ubi, lbi); extents.emplace_back( - builder.create(loc, idxTy, diff, one)); + mlir::arith::AddIOp::create(builder, loc, idxTy, diff, one)); } } else { // lbounds are default. Upper bounds and extents are the same. @@ -634,11 +635,11 @@ void fir::factory::associateMutableBoxWithRemap( source.match( [&](const fir::PolymorphicValue &p) { writer.updateMutableBox(cast(p.getAddr()), lbounds, extents, - /*lengths=*/std::nullopt); + /*lengths=*/{}); }, [&](const fir::UnboxedValue &addr) { writer.updateMutableBox(cast(addr), lbounds, extents, - /*lengths=*/std::nullopt); + /*lengths=*/{}); }, [&](const fir::CharBoxValue &ch) { writer.updateMutableBox(cast(ch.getAddr()), lbounds, extents, @@ -646,7 +647,7 @@ void fir::factory::associateMutableBoxWithRemap( }, [&](const fir::ArrayBoxValue &arr) { writer.updateMutableBox(cast(arr.getAddr()), lbounds, extents, - /*lengths=*/std::nullopt); + /*lengths=*/{}); }, [&](const fir::CharArrayBoxValue &arr) { writer.updateMutableBox(cast(arr.getAddr()), lbounds, extents, @@ -657,8 +658,8 @@ void fir::factory::associateMutableBoxWithRemap( if (box.isDescribedByVariables()) { // LHS is a contiguous pointer described by local variables. Open RHS // fir.box to update the LHS. 
- auto rawAddr = builder.create(loc, arr.getMemTy(), - arr.getAddr()); + auto rawAddr = fir::BoxAddrOp::create(builder, loc, arr.getMemTy(), + arr.getAddr()); llvm::SmallVector lenParams; if (arr.isCharacter()) { lenParams.emplace_back( @@ -678,10 +679,10 @@ void fir::factory::associateMutableBoxWithRemap( shapeArgs.push_back(ext); } auto shape = - builder.create(loc, shapeType, shapeArgs); + fir::ShapeShiftOp::create(builder, loc, shapeType, shapeArgs); auto reboxed = - builder.create(loc, box.getBoxTy(), arr.getAddr(), - shape, /*slice=*/mlir::Value()); + fir::ReboxOp::create(builder, loc, box.getBoxTy(), arr.getAddr(), + shape, /*slice=*/mlir::Value()); writer.updateWithIrBox(reboxed); } }, @@ -748,15 +749,15 @@ static mlir::Value allocateAndInitNewStorage(fir::FirOpBuilder &builder, mlir::ValueRange lenParams, llvm::StringRef allocName) { auto lengths = getNewLengths(builder, loc, box, lenParams); - auto newStorage = builder.create( - loc, box.getBaseTy(), allocName, lengths, extents); + auto newStorage = fir::AllocMemOp::create(builder, loc, box.getBaseTy(), + allocName, lengths, extents); if (mlir::isa(box.getEleTy())) { // TODO: skip runtime initialization if this is not required. Currently, // there is no way to know here if a derived type needs it or not. But the // information is available at compile time and could be reflected here // somehow. - mlir::Value irBox = createNewFirBox(builder, loc, box, newStorage, - std::nullopt, extents, lengths); + mlir::Value irBox = + createNewFirBox(builder, loc, box, newStorage, {}, extents, lengths); fir::runtime::genDerivedTypeInitialize(builder, loc, irBox); } return newStorage; @@ -771,8 +772,8 @@ void fir::factory::genInlinedAllocation( llvm::SmallVector safeExtents; for (mlir::Value extent : extents) safeExtents.push_back(fir::factory::genMaxWithZero(builder, loc, extent)); - auto heap = builder.create(loc, box.getBaseTy(), allocName, - lengths, safeExtents); + auto heap = fir::AllocMemOp::create(builder, loc, box.getBaseTy(), allocName, + lengths, safeExtents); MutablePropertyWriter{builder, loc, box}.updateMutableBox( heap, lbounds, safeExtents, lengths); if (mlir::isa(box.getEleTy())) { @@ -841,10 +842,11 @@ fir::factory::MutableBoxReallocation fir::factory::genReallocIfNeeded( mlir::Value required) { auto castPrevious = builder.createConvert(loc, required.getType(), previous); - auto cmp = builder.create( - loc, mlir::arith::CmpIPredicate::ne, castPrevious, required); - mustReallocate = builder.create( - loc, cmp, cmp, mustReallocate); + auto cmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, castPrevious, + required); + mustReallocate = mlir::arith::SelectOp::create( + builder, loc, cmp, cmp, mustReallocate); }; llvm::SmallVector previousExtents = reader.readShape(); if (!shape.empty()) @@ -879,17 +881,17 @@ fir::factory::MutableBoxReallocation fir::factory::genReallocIfNeeded( ".auto.alloc"); if (storageHandler) storageHandler(getExtValForStorage(heap)); - builder.create(loc, heap); + fir::ResultOp::create(builder, loc, heap); }) .genElse([&]() { if (storageHandler) storageHandler(getExtValForStorage(addr)); - builder.create(loc, addr); + fir::ResultOp::create(builder, loc, addr); }); ifOp.end(); auto newAddr = ifOp.getResults()[0]; - builder.create( - loc, mlir::ValueRange{mustReallocate, newAddr}); + fir::ResultOp::create(builder, loc, + mlir::ValueRange{mustReallocate, newAddr}); }) .genElse([&]() { auto trueValue = builder.createBool(loc, true); @@ -900,15 +902,15 @@ 
fir::factory::MutableBoxReallocation fir::factory::genReallocIfNeeded( builder, loc, "array left hand side must be allocated when the right hand " "side is a scalar"); - builder.create(loc, - mlir::ValueRange{trueValue, addr}); + fir::ResultOp::create(builder, loc, + mlir::ValueRange{trueValue, addr}); } else { auto heap = allocateAndInitNewStorage( builder, loc, box, shape, lengthParams, ".auto.alloc"); if (storageHandler) storageHandler(getExtValForStorage(heap)); - builder.create(loc, - mlir::ValueRange{trueValue, heap}); + fir::ResultOp::create(builder, loc, + mlir::ValueRange{trueValue, heap}); } }); ifOp.end(); @@ -976,7 +978,7 @@ mlir::Value fir::factory::genNullBoxStorage(fir::FirOpBuilder &builder, mlir::Value boxStorage = builder.createTemporary(loc, boxTy); mlir::Value nullBox = fir::factory::createUnallocatedBox( builder, loc, boxTy, /*nonDeferredParams=*/{}); - builder.create(loc, nullBox, boxStorage); + fir::StoreOp::create(builder, loc, nullBox, boxStorage); return boxStorage; } @@ -988,11 +990,11 @@ mlir::Value fir::factory::getAndEstablishBoxStorage( mlir::Value nullAddr = builder.createNullConstant(loc, boxTy.getBaseAddressType()); mlir::Value box = - builder.create(loc, boxTy, nullAddr, shape, - /*emptySlice=*/mlir::Value{}, - fir::factory::elideLengthsAlreadyInType( - boxTy.unwrapInnerType(), typeParams), - polymorphicMold); - builder.create(loc, box, boxStorage); + fir::EmboxOp::create(builder, loc, boxTy, nullAddr, shape, + /*emptySlice=*/mlir::Value{}, + fir::factory::elideLengthsAlreadyInType( + boxTy.unwrapInnerType(), typeParams), + polymorphicMold); + fir::StoreOp::create(builder, loc, box, boxStorage); return boxStorage; } diff --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp index 0094ce892d61b..03952da95b11e 100644 --- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp @@ -17,6 +17,7 @@ #include "flang/Evaluate/common.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/MutableBox.h" +#include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" namespace fir { @@ -1090,7 +1091,7 @@ void PPCIntrinsicLibrary::genMtfsf(llvm::ArrayRef args) { builder.getContext(), builder); funcOp = builder.createFunction(loc, "llvm.ppc.mtfsf", libFuncType); } - builder.create(loc, funcOp, scalarArgs); + fir::CallOp::create(builder, loc, funcOp, scalarArgs); } // VEC_ABS @@ -1117,7 +1118,7 @@ PPCIntrinsicLibrary::genVecAbs(mlir::Type resultType, } funcOp = builder.createFunction(loc, fname, ftype); - auto callOp{builder.create(loc, funcOp, argBases[0])}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, argBases[0])}; return callOp.getResult(0); } else if (auto eleTy = mlir::dyn_cast(vTypeInfo.eleTy)) { // vec_abs(arg1) = max(0 - arg1, arg1) @@ -1127,8 +1128,8 @@ PPCIntrinsicLibrary::genVecAbs(mlir::Type resultType, // construct vector(0,..) 
auto zeroVal{builder.createIntegerConstant(loc, eleTy, 0)}; auto vZero{ - builder.create(loc, newVecTy, zeroVal)}; - auto zeroSubVarg1{builder.create(loc, vZero, varg1)}; + mlir::vector::BroadcastOp::create(builder, loc, newVecTy, zeroVal)}; + auto zeroSubVarg1{mlir::arith::SubIOp::create(builder, loc, vZero, varg1)}; mlir::func::FuncOp funcOp{nullptr}; switch (eleTy.getWidth()) { @@ -1158,7 +1159,7 @@ PPCIntrinsicLibrary::genVecAbs(mlir::Type resultType, funcOp = builder.createFunction(loc, fname, ftype); mlir::Value args[] = {zeroSubVarg1, varg1}; - auto callOp{builder.create(loc, funcOp, args)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, args)}; return builder.createConvert(loc, argBases[0].getType(), callOp.getResult(0)); } @@ -1188,21 +1189,21 @@ fir::ExtendedValue PPCIntrinsicLibrary::genVecAddAndMulSubXor( switch (vop) { case VecOp::Add: if (isInteger) - r = builder.create(loc, vargs[0], vargs[1]); + r = mlir::arith::AddIOp::create(builder, loc, vargs[0], vargs[1]); else if (isFloat) - r = builder.create(loc, vargs[0], vargs[1]); + r = mlir::arith::AddFOp::create(builder, loc, vargs[0], vargs[1]); break; case VecOp::Mul: if (isInteger) - r = builder.create(loc, vargs[0], vargs[1]); + r = mlir::arith::MulIOp::create(builder, loc, vargs[0], vargs[1]); else if (isFloat) - r = builder.create(loc, vargs[0], vargs[1]); + r = mlir::arith::MulFOp::create(builder, loc, vargs[0], vargs[1]); break; case VecOp::Sub: if (isInteger) - r = builder.create(loc, vargs[0], vargs[1]); + r = mlir::arith::SubIOp::create(builder, loc, vargs[0], vargs[1]); else if (isFloat) - r = builder.create(loc, vargs[0], vargs[1]); + r = mlir::arith::SubFOp::create(builder, loc, vargs[0], vargs[1]); break; case VecOp::And: case VecOp::Xor: { @@ -1216,16 +1217,16 @@ fir::ExtendedValue PPCIntrinsicLibrary::genVecAddAndMulSubXor( auto wd{mlir::dyn_cast(vecTyInfo.eleTy).getWidth()}; auto ftype{builder.getIntegerType(wd)}; auto bcVecTy{mlir::VectorType::get(vecTyInfo.len, ftype)}; - arg1 = builder.create(loc, bcVecTy, vargs[0]); - arg2 = builder.create(loc, bcVecTy, vargs[1]); + arg1 = mlir::vector::BitCastOp::create(builder, loc, bcVecTy, vargs[0]); + arg2 = mlir::vector::BitCastOp::create(builder, loc, bcVecTy, vargs[1]); } if (vop == VecOp::And) - r = builder.create(loc, arg1, arg2); + r = mlir::arith::AndIOp::create(builder, loc, arg1, arg2); else if (vop == VecOp::Xor) - r = builder.create(loc, arg1, arg2); + r = mlir::arith::XOrIOp::create(builder, loc, arg1, arg2); if (isFloat) - r = builder.create(loc, vargs[0].getType(), r); + r = mlir::vector::BitCastOp::create(builder, loc, vargs[0].getType(), r); break; } @@ -1341,7 +1342,7 @@ PPCIntrinsicLibrary::genVecAnyCompare(mlir::Type resultType, assert((!fname.empty() && ftype) && "invalid type"); mlir::func::FuncOp funcOp{builder.createFunction(loc, fname, ftype)}; - auto callOp{builder.create(loc, funcOp, cmpArgs)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, cmpArgs)}; return callOp.getResult(0); } @@ -1472,7 +1473,7 @@ PPCIntrinsicLibrary::genVecCmp(mlir::Type resultType, // arg1 < arg2 --> vcmpgt(arg2, arg1) mlir::Value vargs[]{argBases[argOrder[vop][0]], argBases[argOrder[vop][1]]}; - auto callOp{builder.create(loc, funcOp, vargs)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, vargs)}; res = callOp.getResult(0); break; } @@ -1486,14 +1487,15 @@ PPCIntrinsicLibrary::genVecCmp(mlir::Type resultType, // Construct a constant vector(-1) auto negOneVal{builder.createIntegerConstant( loc, getConvertedElementType(context, eTy), -1)}; - 
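For integer element types, the vec_abs lowering above materializes max(0 - arg1, arg1) by broadcasting zero, subtracting, and calling the vmax intrinsic. A scalar, element-wise sketch of the same identity in plain C++:

    #include <algorithm>
    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      // vec_abs(arg1) = max(0 - arg1, arg1), applied per element.
      std::array<int32_t, 4> v{-5, 3, -7, 0};
      for (int32_t &e : v)
        e = std::max(int32_t(0) - e, e);
      assert((v == std::array<int32_t, 4>{5, 3, 7, 0}));
    }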
auto vNegOne{builder.create( - loc, vecTyInfo.toMlirVectorType(context), negOneVal)}; + auto vNegOne{mlir::vector::BroadcastOp::create( + builder, loc, vecTyInfo.toMlirVectorType(context), negOneVal)}; - auto callOp{builder.create(loc, funcOp, vargs)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, vargs)}; mlir::Value callRes{callOp.getResult(0)}; auto vargs2{ convertVecArgs(builder, loc, vecTyInfo, mlir::ValueRange{callRes})}; - auto xorRes{builder.create(loc, vargs2[0], vNegOne)}; + auto xorRes{ + mlir::arith::XOrIOp::create(builder, loc, vargs2[0], vNegOne)}; res = builder.createConvert(loc, returnType, xorRes); break; @@ -1518,7 +1520,7 @@ PPCIntrinsicLibrary::genVecCmp(mlir::Type resultType, default: llvm_unreachable("Invalid vector operation for generator"); } - auto callOp{builder.create(loc, funcOp, vargs)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, vargs)}; res = callOp.getResult(0); } else llvm_unreachable("invalid vector type"); @@ -1534,13 +1536,13 @@ static inline mlir::Value swapVectorWordPairs(fir::FirOpBuilder &builder, auto vtype{mlir::VectorType::get(16, mlir::IntegerType::get(context, 8))}; if (ty != vtype) - arg = builder.create(loc, vtype, arg).getResult(); + arg = mlir::LLVM::BitcastOp::create(builder, loc, vtype, arg).getResult(); llvm::SmallVector mask{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; - arg = builder.create(loc, arg, arg, mask); + arg = mlir::vector::ShuffleOp::create(builder, loc, arg, arg, mask); if (ty != vtype) - arg = builder.create(loc, ty, arg); + arg = mlir::LLVM::BitcastOp::create(builder, loc, ty, arg); return arg; } @@ -1575,7 +1577,7 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType, : "llvm.ppc.altivec.vcfsx"}; auto funcOp{builder.createFunction(loc, fname, ftype)}; mlir::Value newArgs[] = {argBases[0], convArg}; - auto callOp{builder.create(loc, funcOp, newArgs)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, newArgs)}; return callOp.getResult(0); } else if (width == 64) { @@ -1584,8 +1586,8 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType, // vec_vtf(arg1, arg2) = fmul(1.0 / (1 << arg2), llvm.sitofp(arg1)) auto convOp{(isUnsigned) - ? builder.create(loc, ty, vArg1) - : builder.create(loc, ty, vArg1)}; + ? 
mlir::LLVM::UIToFPOp::create(builder, loc, ty, vArg1) + : mlir::LLVM::SIToFPOp::create(builder, loc, ty, vArg1)}; // construct vector<1./(1< auto constInt{mlir::dyn_cast_or_null( @@ -1594,11 +1596,11 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType, assert(constInt && "expected integer constant argument"); double f{1.0 / (1 << constInt.getInt())}; llvm::SmallVector vals{f, f}; - auto constOp{builder.create( - loc, ty, builder.getF64VectorAttr(vals))}; + auto constOp{mlir::arith::ConstantOp::create( + builder, loc, ty, builder.getF64VectorAttr(vals))}; - auto mulOp{builder.create( - loc, ty, convOp->getResult(0), constOp)}; + auto mulOp{mlir::LLVM::FMulOp::create(builder, loc, ty, + convOp->getResult(0), constOp)}; return builder.createConvert(loc, fir::VectorType::get(2, fTy), mulOp); } @@ -1612,7 +1614,7 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType, auto firTy{resTyInfo.toFirVectorType()}; // vec_convert(v, mold) = bitcast v to "type of mold" - auto conv{builder.create(loc, moldTy, vArg1)}; + auto conv{mlir::LLVM::BitcastOp::create(builder, loc, moldTy, vArg1)}; return builder.createConvert(loc, firTy, conv); } @@ -1628,7 +1630,7 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType, auto ftype{ genFuncType, Ty::RealVector<4>>(context, builder)}; auto funcOp{builder.createFunction(loc, fname, ftype)}; - auto callOp{builder.create(loc, funcOp, newArgs)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, newArgs)}; return callOp.getResult(0); } else if (vecTyInfo.isFloat64()) { @@ -1637,7 +1639,7 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType, genFuncType, Ty::RealVector<8>>(context, builder)}; auto funcOp{builder.createFunction(loc, fname, ftype)}; newArgs[0] = - builder.create(loc, funcOp, newArgs).getResult(0); + fir::CallOp::create(builder, loc, funcOp, newArgs).getResult(0); auto fvf32Ty{newArgs[0].getType()}; auto f32type{mlir::Float32Type::get(context)}; auto mvf32Ty{mlir::VectorType::get(4, f32type)}; @@ -1661,7 +1663,7 @@ static mlir::Value convertVectorElementOrder(fir::FirOpBuilder &builder, mlir::Value idx) { mlir::Value numSub1{ builder.createIntegerConstant(loc, idx.getType(), vecInfo.len - 1)}; - return builder.create(loc, idx.getType(), numSub1, idx); + return mlir::LLVM::SubOp::create(builder, loc, idx.getType(), numSub1, idx); } // VEC_EXTRACT @@ -1680,12 +1682,14 @@ PPCIntrinsicLibrary::genVecExtract(mlir::Type resultType, // position auto numEle{builder.createIntegerConstant(loc, argTypes[1], vecTyInfo.len)}; mlir::Value uremOp{ - builder.create(loc, argBases[1], numEle)}; + mlir::LLVM::URemOp::create(builder, loc, argBases[1], numEle)}; if (!isNativeVecElemOrderOnLE()) uremOp = convertVectorElementOrder(builder, loc, vecTyInfo, uremOp); - return builder.create(loc, varg0, uremOp); + mlir::Value index = builder.createOrFold( + loc, builder.getIndexType(), uremOp); + return mlir::vector::ExtractOp::create(builder, loc, varg0, index); } // VEC_INSERT @@ -1701,14 +1705,16 @@ PPCIntrinsicLibrary::genVecInsert(mlir::Type resultType, auto numEle{builder.createIntegerConstant(loc, argTypes[2], vecTyInfo.len)}; mlir::Value uremOp{ - builder.create(loc, argBases[2], numEle)}; + mlir::LLVM::URemOp::create(builder, loc, argBases[2], numEle)}; if (!isNativeVecElemOrderOnLE()) uremOp = convertVectorElementOrder(builder, loc, vecTyInfo, uremOp); - auto res{builder.create(loc, argBases[0], - varg1, uremOp)}; - return builder.create(loc, vecTyInfo.toFirVectorType(), res); + mlir::Value index = builder.createOrFold( + loc, 
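Two details of the lowerings above are easy to model in scalar code: vec_extract/vec_insert reduce the element position modulo the vector length (the urem) and, when the reversed element order applies, mirror it via convertVectorElementOrder to (len - 1) - pos; the 64-bit vec_vtf path scales the converted integer by 1.0 / (1 << arg2). A small sketch under those assumptions:

    #include <cassert>
    #include <cstdint>

    // Element position handling from genVecExtract/genVecInsert.
    static uint64_t effectiveIndex(uint64_t pos, uint64_t len, bool reversed) {
      uint64_t idx = pos % len;                // the urem above
      return reversed ? (len - 1) - idx : idx; // convertVectorElementOrder
    }

    // 64-bit vec_vtf: fmul(1.0 / (1 << arg2), sitofp(arg1)), per element.
    static double vtfScalar(int64_t value, unsigned shift) {
      return (1.0 / (1 << shift)) * static_cast<double>(value);
    }

    int main() {
      assert(effectiveIndex(5, 4, /*reversed=*/false) == 1);
      assert(effectiveIndex(5, 4, /*reversed=*/true) == 2);
      assert(vtfScalar(16, 3) == 2.0);
    }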
builder.getIndexType(), uremOp); + mlir::Value res = + mlir::vector::InsertOp::create(builder, loc, argBases[0], varg1, index); + return fir::ConvertOp::create(builder, loc, vecTyInfo.toFirVectorType(), res); } // VEC_MERGEH, VEC_MERGEL @@ -1794,8 +1800,8 @@ PPCIntrinsicLibrary::genVecMerge(mlir::Type resultType, llvm::SmallVector &mergeMask = (isBEVecElemOrderOnLE()) ? rMask : mMask; - auto callOp{builder.create(loc, vargs[0], vargs[1], - mergeMask)}; + auto callOp{mlir::vector::ShuffleOp::create(builder, loc, vargs[0], vargs[1], + mergeMask)}; return builder.createConvert(loc, resultType, callOp); } @@ -1807,9 +1813,9 @@ static mlir::Value addOffsetToAddress(fir::FirOpBuilder &builder, auto arrRefTy{builder.getRefType(fir::SequenceType::get( {typeExtent}, mlir::IntegerType::get(builder.getContext(), 8)))}; // Convert arg to !fir.ref> - auto resAddr{builder.create(loc, arrRefTy, baseAddr)}; + auto resAddr{fir::ConvertOp::create(builder, loc, arrRefTy, baseAddr)}; - return builder.create(loc, arrRefTy, resAddr, offset); + return fir::CoordinateOp::create(builder, loc, arrRefTy, resAddr, offset); } static mlir::Value reverseVectorElements(fir::FirOpBuilder &builder, @@ -1821,8 +1827,8 @@ static mlir::Value reverseVectorElements(fir::FirOpBuilder &builder, for (int64_t i = 0; i < len; ++i) { mask.push_back(len - 1 - i); } - auto undefVec{builder.create(loc, v.getType())}; - return builder.create(loc, v, undefVec, mask); + auto undefVec{fir::UndefOp::create(builder, loc, v.getType())}; + return mlir::vector::ShuffleOp::create(builder, loc, v, undefVec, mask); } static mlir::NamedAttribute getAlignmentAttr(fir::FirOpBuilder &builder, @@ -1871,8 +1877,8 @@ fir::ExtendedValue PPCIntrinsicLibrary::genVecLdNoCallGrp( const auto triple{fir::getTargetTriple(builder.getModule())}; // Need to get align 1. 
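reverseVectorElements above builds a shuffle mask of len-1, len-2, ..., 0, so element i of the result comes from element (len - 1 - i) of the source. A scalar sketch of that mapping:

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      std::array<int32_t, 4> src{1, 2, 3, 4};
      std::array<int32_t, 4> rev{};
      for (size_t i = 0; i < src.size(); ++i)
        rev[i] = src[src.size() - 1 - i]; // mask entry: len - 1 - i
      assert((rev == std::array<int32_t, 4>{4, 3, 2, 1}));
    }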
- auto result{builder.create(loc, mlirTy, addr, - getAlignmentAttr(builder, 1))}; + auto result{fir::LoadOp::create(builder, loc, mlirTy, addr, + getAlignmentAttr(builder, 1))}; if ((vop == VecOp::Xl && isBEVecElemOrderOnLE()) || (vop == VecOp::Xlbe && triple.isLittleEndian())) return builder.createConvert( @@ -1965,13 +1971,13 @@ PPCIntrinsicLibrary::genVecLdCallGrp(mlir::Type resultType, mlir::FunctionType::get(context, {addr.getType()}, {intrinResTy})}; auto funcOp{builder.createFunction(loc, fname, funcType)}; auto result{ - builder.create(loc, funcOp, parsedArgs).getResult(0)}; + fir::CallOp::create(builder, loc, funcOp, parsedArgs).getResult(0)}; if (vop == VecOp::Lxvp) return result; if (intrinResTy != mlirTy) - result = builder.create(loc, mlirTy, result); + result = mlir::vector::BitCastOp::create(builder, loc, mlirTy, result); if (vop != VecOp::Xld2 && vop != VecOp::Xlw4 && isBEVecElemOrderOnLE()) return builder.createConvert( @@ -1998,13 +2004,13 @@ PPCIntrinsicLibrary::genVecLvsGrp(mlir::Type resultType, // Convert arg0 to i64 type if needed auto i64ty{mlir::IntegerType::get(context, 64)}; if (arg0.getType() != i64ty) - arg0 = builder.create(loc, i64ty, arg0); + arg0 = fir::ConvertOp::create(builder, loc, i64ty, arg0); // offset is modulo 16, so shift left 56 bits and then right 56 bits to clear // upper 56 bit while preserving sign auto shiftVal{builder.createIntegerConstant(loc, i64ty, 56)}; - auto offset{builder.create(loc, arg0, shiftVal)}; - auto offset2{builder.create(loc, offset, shiftVal)}; + auto offset{mlir::arith::ShLIOp::create(builder, loc, arg0, shiftVal)}; + auto offset2{mlir::arith::ShRSIOp::create(builder, loc, offset, shiftVal)}; // Add the offsetArg to %addr of arg1 auto addr{addOffsetToAddress(builder, loc, arg1, offset2)}; @@ -2024,7 +2030,7 @@ PPCIntrinsicLibrary::genVecLvsGrp(mlir::Type resultType, auto funcType{mlir::FunctionType::get(context, {addr.getType()}, {mlirTy})}; auto funcOp{builder.createFunction(loc, fname, funcType)}; auto result{ - builder.create(loc, funcOp, parsedArgs).getResult(0)}; + fir::CallOp::create(builder, loc, funcOp, parsedArgs).getResult(0)}; if (isNativeVecElemOrderOnLE()) return builder.createConvert( @@ -2061,19 +2067,19 @@ PPCIntrinsicLibrary::genVecNmaddMsub(mlir::Type resultType, std::get<1>(fmaMap[width]))}; if (vop == VecOp::Nmadd) { // vec_nmadd(arg1, arg2, arg3) = -fma(arg1, arg2, arg3) - auto callOp{builder.create(loc, funcOp, newArgs)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, newArgs)}; // We need to convert fir.vector to MLIR vector to use fneg and then back // to fir.vector to store. 
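The genVecLvsGrp path above narrows the 64-bit offset by shifting left 56 bits and then arithmetic-shifting right 56 bits, keeping only the low byte while preserving its sign (lvsl/lvsr only use the offset modulo 16). A scalar sketch of that shift pair; the left shift is done on the unsigned representation to avoid overflow, and the right shift assumes the usual arithmetic behaviour for signed operands:

    #include <cassert>
    #include <cstdint>

    static int64_t truncateOffset(int64_t arg0) {
      // shl 56 then ashr 56: sign-extend the low byte of the offset.
      return static_cast<int64_t>(static_cast<uint64_t>(arg0) << 56) >> 56;
    }

    int main() {
      assert(truncateOffset(0x1234) == 0x34);
      assert(truncateOffset(-1) == -1);
    }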
auto vCall{builder.createConvert(loc, vTypeInfo.toMlirVectorType(context), callOp.getResult(0))}; - auto neg{builder.create(loc, vCall)}; + auto neg{mlir::arith::NegFOp::create(builder, loc, vCall)}; return builder.createConvert(loc, vTypeInfo.toFirVectorType(), neg); } else if (vop == VecOp::Msub) { // vec_msub(arg1, arg2, arg3) = fma(arg1, arg2, -arg3) - newArgs[2] = builder.create(loc, newArgs[2]); + newArgs[2] = mlir::arith::NegFOp::create(builder, loc, newArgs[2]); - auto callOp{builder.create(loc, funcOp, newArgs)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, newArgs)}; return callOp.getResult(0); } llvm_unreachable("Invalid vector operation for generator"); @@ -2104,10 +2110,10 @@ PPCIntrinsicLibrary::genVecPerm(mlir::Type resultType, auto mMask{builder.createConvert(loc, mlirMaskTy, argBases[2])}; if (mlirTy != vi32Ty) { - mArg0 = - builder.create(loc, vi32Ty, mArg0).getResult(); - mArg1 = - builder.create(loc, vi32Ty, mArg1).getResult(); + mArg0 = mlir::LLVM::BitcastOp::create(builder, loc, vi32Ty, mArg0) + .getResult(); + mArg1 = mlir::LLVM::BitcastOp::create(builder, loc, vi32Ty, mArg1) + .getResult(); } auto funcOp{builder.createFunction( @@ -2122,23 +2128,23 @@ PPCIntrinsicLibrary::genVecPerm(mlir::Type resultType, auto v8Ty{mlir::VectorType::get(16, i8Ty)}; auto negOne{builder.createMinusOneInteger(loc, i8Ty)}; auto vNegOne{ - builder.create(loc, v8Ty, negOne)}; + mlir::vector::BroadcastOp::create(builder, loc, v8Ty, negOne)}; - mMask = builder.create(loc, mMask, vNegOne); + mMask = mlir::arith::XOrIOp::create(builder, loc, mMask, vNegOne); newArgs = {mArg1, mArg0, mMask}; } else { newArgs = {mArg0, mArg1, mMask}; } - auto res{builder.create(loc, funcOp, newArgs).getResult(0)}; + auto res{fir::CallOp::create(builder, loc, funcOp, newArgs).getResult(0)}; if (res.getType() != argTypes[0]) { // fir.call llvm.ppc.altivec.vperm returns !fir.vector // convert the result back to the original type res = builder.createConvert(loc, vi32Ty, res); if (mlirTy != vi32Ty) - res = - builder.create(loc, mlirTy, res).getResult(); + res = mlir::LLVM::BitcastOp::create(builder, loc, mlirTy, res) + .getResult(); } return builder.createConvert(loc, resultType, res); } @@ -2151,10 +2157,10 @@ PPCIntrinsicLibrary::genVecPerm(mlir::Type resultType, auto constInt{constIntOp.getInt()}; // arg1, arg2, and result type share same VecTypeInfo if (vecTyInfo.isFloat()) { - mArg0 = - builder.create(loc, vf64Ty, mArg0).getResult(); - mArg1 = - builder.create(loc, vf64Ty, mArg1).getResult(); + mArg0 = mlir::LLVM::BitcastOp::create(builder, loc, vf64Ty, mArg0) + .getResult(); + mArg1 = mlir::LLVM::BitcastOp::create(builder, loc, vf64Ty, mArg1) + .getResult(); } llvm::SmallVector nMask; // native vector element order mask @@ -2183,9 +2189,9 @@ PPCIntrinsicLibrary::genVecPerm(mlir::Type resultType, llvm::SmallVector mask = (isBEVecElemOrderOnLE()) ? 
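The vec_nmadd and vec_msub lowerings above each reduce to one fused multiply-add: vec_nmadd(a, b, c) = -fma(a, b, c) and vec_msub(a, b, c) = fma(a, b, -c). A scalar check with std::fma:

    #include <cassert>
    #include <cmath>

    int main() {
      double a = 2.0, b = 3.0, c = 4.0;
      double nmadd = -std::fma(a, b, c); // -(2*3 + 4) = -10
      double msub = std::fma(a, b, -c);  //   2*3 - 4  =   2
      assert(nmadd == -10.0);
      assert(msub == 2.0);
    }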
rMask : nMask; - auto res{builder.create(loc, mArg0, mArg1, mask)}; + auto res{mlir::vector::ShuffleOp::create(builder, loc, mArg0, mArg1, mask)}; if (res.getType() != mlirTy) { - auto cast{builder.create(loc, mlirTy, res)}; + auto cast{mlir::LLVM::BitcastOp::create(builder, loc, mlirTy, res)}; return builder.createConvert(loc, resultType, cast); } return builder.createConvert(loc, resultType, res); @@ -2212,22 +2218,23 @@ PPCIntrinsicLibrary::genVecSel(mlir::Type resultType, // construct a constant <16 x i8> vector with value -1 for bitcast auto bcVecTy{mlir::VectorType::get(16, i8Ty)}; - auto vNegOne{builder.create(loc, bcVecTy, negOne)}; + auto vNegOne{ + mlir::vector::BroadcastOp::create(builder, loc, bcVecTy, negOne)}; // bitcast arguments to bcVecTy - auto arg1{builder.create(loc, bcVecTy, vargs[0])}; - auto arg2{builder.create(loc, bcVecTy, vargs[1])}; - auto arg3{builder.create(loc, bcVecTy, vargs[2])}; + auto arg1{mlir::vector::BitCastOp::create(builder, loc, bcVecTy, vargs[0])}; + auto arg2{mlir::vector::BitCastOp::create(builder, loc, bcVecTy, vargs[1])}; + auto arg3{mlir::vector::BitCastOp::create(builder, loc, bcVecTy, vargs[2])}; // vec_sel(arg1, arg2, arg3) = // (arg2 and arg3) or (arg1 and (arg3 xor vector(-1,...))) - auto comp{builder.create(loc, arg3, vNegOne)}; - auto a1AndComp{builder.create(loc, arg1, comp)}; - auto a1OrA2{builder.create(loc, arg2, arg3)}; - auto res{builder.create(loc, a1AndComp, a1OrA2)}; + auto comp{mlir::arith::XOrIOp::create(builder, loc, arg3, vNegOne)}; + auto a1AndComp{mlir::arith::AndIOp::create(builder, loc, arg1, comp)}; + auto a1OrA2{mlir::arith::AndIOp::create(builder, loc, arg2, arg3)}; + auto res{mlir::arith::OrIOp::create(builder, loc, a1AndComp, a1OrA2)}; auto bcRes{ - builder.create(loc, vargs[0].getType(), res)}; + mlir::vector::BitCastOp::create(builder, loc, vargs[0].getType(), res)}; return builder.createConvert(loc, vecTyInfos[0].toFirVectorType(), bcRes); } @@ -2264,14 +2271,14 @@ PPCIntrinsicLibrary::genVecShift(mlir::Type resultType, auto vecVal{builder.createIntegerConstant( loc, getConvertedElementType(context, vecTyInfoArgs[0].eleTy), width)}; auto mask{ - builder.create(loc, mlirTyArgs[1], vecVal)}; - auto shft{builder.create(loc, mlirVecArgs[1], mask)}; + mlir::vector::BroadcastOp::create(builder, loc, mlirTyArgs[1], vecVal)}; + auto shft{mlir::arith::RemUIOp::create(builder, loc, mlirVecArgs[1], mask)}; mlir::Value res{nullptr}; if (vop == VecOp::Sr) - res = builder.create(loc, mlirVecArgs[0], shft); + res = mlir::arith::ShRUIOp::create(builder, loc, mlirVecArgs[0], shft); else if (vop == VecOp::Sl) - res = builder.create(loc, mlirVecArgs[0], shft); + res = mlir::arith::ShLIOp::create(builder, loc, mlirVecArgs[0], shft); shftRes = builder.createConvert(loc, argTypes[0], res); } else if (vop == VecOp::Sll || vop == VecOp::Slo || vop == VecOp::Srl || @@ -2281,11 +2288,11 @@ PPCIntrinsicLibrary::genVecShift(mlir::Type resultType, // Bitcast to vector<4xi32> auto bcVecTy{mlir::VectorType::get(4, builder.getIntegerType(32))}; if (mlirTyArgs[0] != bcVecTy) - mlirVecArgs[0] = - builder.create(loc, bcVecTy, mlirVecArgs[0]); + mlirVecArgs[0] = mlir::vector::BitCastOp::create(builder, loc, bcVecTy, + mlirVecArgs[0]); if (mlirTyArgs[1] != bcVecTy) - mlirVecArgs[1] = - builder.create(loc, bcVecTy, mlirVecArgs[1]); + mlirVecArgs[1] = mlir::vector::BitCastOp::create(builder, loc, bcVecTy, + mlirVecArgs[1]); llvm::StringRef funcName; switch (vop) { @@ -2307,13 +2314,13 @@ PPCIntrinsicLibrary::genVecShift(mlir::Type resultType, auto 
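The vec_sel lowering above is a plain bitwise select: (arg2 and arg3) or (arg1 and (arg3 xor -1)), i.e. take bits of arg2 where the mask is set and bits of arg1 elsewhere. A scalar sketch on a single byte:

    #include <cassert>
    #include <cstdint>

    static uint8_t selScalar(uint8_t a, uint8_t b, uint8_t m) {
      // (b & m) | (a & ~m), matching the xor/and/or sequence above.
      return static_cast<uint8_t>((b & m) | (a & static_cast<uint8_t>(m ^ 0xFF)));
    }

    int main() {
      assert(selScalar(0xAA, 0x55, 0x0F) == 0xA5);
    }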
funcTy{genFuncType, Ty::IntegerVector<4>, Ty::IntegerVector<4>>(context, builder)}; mlir::func::FuncOp funcOp{builder.createFunction(loc, funcName, funcTy)}; - auto callOp{builder.create(loc, funcOp, mlirVecArgs)}; + auto callOp{fir::CallOp::create(builder, loc, funcOp, mlirVecArgs)}; // If the result vector type is different from the original type, need // to convert to mlir vector, bitcast and then convert back to fir vector. if (callOp.getResult(0).getType() != argTypes[0]) { auto res = builder.createConvert(loc, bcVecTy, callOp.getResult(0)); - res = builder.create(loc, mlirTyArgs[0], res); + res = mlir::vector::BitCastOp::create(builder, loc, mlirTyArgs[0], res); shftRes = builder.createConvert(loc, argTypes[0], res); } else { shftRes = callOp.getResult(0); @@ -2329,10 +2336,10 @@ PPCIntrinsicLibrary::genVecShift(mlir::Type resultType, auto vi8Ty{mlir::VectorType::get(16, builder.getIntegerType(8))}; if (mlirTyArgs[0] != vi8Ty) { mlirVecArgs[0] = - builder.create(loc, vi8Ty, mlirVecArgs[0]) + mlir::LLVM::BitcastOp::create(builder, loc, vi8Ty, mlirVecArgs[0]) .getResult(); mlirVecArgs[1] = - builder.create(loc, vi8Ty, mlirVecArgs[1]) + mlir::LLVM::BitcastOp::create(builder, loc, vi8Ty, mlirVecArgs[1]) .getResult(); } @@ -2347,19 +2354,19 @@ PPCIntrinsicLibrary::genVecShift(mlir::Type resultType, if (triple.isLittleEndian()) { for (int i = 16; i < 32; ++i) mask.push_back(i - shiftVal); - shftRes = builder.create(loc, mlirVecArgs[1], - mlirVecArgs[0], mask); + shftRes = mlir::vector::ShuffleOp::create(builder, loc, mlirVecArgs[1], + mlirVecArgs[0], mask); } else { for (int i = 0; i < 16; ++i) mask.push_back(i + shiftVal); - shftRes = builder.create(loc, mlirVecArgs[0], - mlirVecArgs[1], mask); + shftRes = mlir::vector::ShuffleOp::create(builder, loc, mlirVecArgs[0], + mlirVecArgs[1], mask); } // Bitcast to the original type if (shftRes.getType() != mlirTyArgs[0]) shftRes = - builder.create(loc, mlirTyArgs[0], shftRes); + mlir::LLVM::BitcastOp::create(builder, loc, mlirTyArgs[0], shftRes); return builder.createConvert(loc, resultType, shftRes); } else @@ -2384,8 +2391,9 @@ PPCIntrinsicLibrary::genVecSplat(mlir::Type resultType, auto vecTyInfo{getVecTypeFromFir(argBases[0])}; auto extractOp{genVecExtract(resultType, args)}; - splatOp = builder.create( - loc, *(extractOp.getUnboxed()), vecTyInfo.toMlirVectorType(context)); + splatOp = + mlir::vector::SplatOp::create(builder, loc, *(extractOp.getUnboxed()), + vecTyInfo.toMlirVectorType(context)); retTy = vecTyInfo.toFirVectorType(); break; } @@ -2393,8 +2401,8 @@ PPCIntrinsicLibrary::genVecSplat(mlir::Type resultType, assert(args.size() == 1); auto vecTyInfo{getVecTypeFromEle(argBases[0])}; - splatOp = builder.create( - loc, argBases[0], vecTyInfo.toMlirVectorType(context)); + splatOp = mlir::vector::SplatOp::create( + builder, loc, argBases[0], vecTyInfo.toMlirVectorType(context)); retTy = vecTyInfo.toFirVectorType(); break; } @@ -2404,8 +2412,8 @@ PPCIntrinsicLibrary::genVecSplat(mlir::Type resultType, auto intOp{builder.createConvert(loc, eleTy, argBases[0])}; // the intrinsic always returns vector(integer(4)) - splatOp = builder.create( - loc, intOp, mlir::VectorType::get(4, eleTy)); + splatOp = mlir::vector::SplatOp::create(builder, loc, intOp, + mlir::VectorType::get(4, eleTy)); retTy = fir::VectorType::get(4, eleTy); break; } @@ -2433,14 +2441,14 @@ PPCIntrinsicLibrary::genVecXlds(mlir::Type resultType, auto i64Ty{mlir::IntegerType::get(builder.getContext(), 64)}; auto i64VecTy{mlir::VectorType::get(2, i64Ty)}; auto 
i64RefTy{builder.getRefType(i64Ty)}; - auto addrConv{builder.create(loc, i64RefTy, addr)}; + auto addrConv{fir::ConvertOp::create(builder, loc, i64RefTy, addr)}; - auto addrVal{builder.create(loc, addrConv)}; - auto splatRes{builder.create(loc, addrVal, i64VecTy)}; + auto addrVal{fir::LoadOp::create(builder, loc, addrConv)}; + auto splatRes{mlir::vector::SplatOp::create(builder, loc, addrVal, i64VecTy)}; mlir::Value result{nullptr}; if (mlirTy != splatRes.getType()) { - result = builder.create(loc, mlirTy, splatRes); + result = mlir::vector::BitCastOp::create(builder, loc, mlirTy, splatRes); } else result = splatRes; @@ -2790,7 +2798,7 @@ void PPCIntrinsicLibrary::genMmaIntr(llvm::ArrayRef args) { if (i == 0 && HandlerOp == MMAHandlerOp::FirstArgIsResult) { // First argument is passed in as an address. We need to load // the content to match the LLVM interface. - v = builder.create(loc, v); + v = fir::LoadOp::create(builder, loc, v); } auto vType{v.getType()}; mlir::Type targetType{intrFuncType.getInput(j)}; @@ -2801,7 +2809,7 @@ void PPCIntrinsicLibrary::genMmaIntr(llvm::ArrayRef args) { auto len{mlir::dyn_cast(vType).getLen()}; mlir::VectorType mlirType = mlir::VectorType::get(len, eleTy); auto v0{builder.createConvert(loc, mlirType, v)}; - auto v1{builder.create(loc, targetType, v0)}; + auto v1{mlir::vector::BitCastOp::create(builder, loc, targetType, v0)}; intrArgs.push_back(v1); } else if (mlir::isa(targetType) && mlir::isa(vType)) { @@ -2817,7 +2825,7 @@ void PPCIntrinsicLibrary::genMmaIntr(llvm::ArrayRef args) { intrArgs.push_back(v); } } - auto callSt{builder.create(loc, funcOp, intrArgs)}; + auto callSt{fir::CallOp::create(builder, loc, funcOp, intrArgs)}; if (HandlerOp == MMAHandlerOp::SubToFunc || HandlerOp == MMAHandlerOp::SubToFuncReverseArgOnLE || HandlerOp == MMAHandlerOp::FirstArgIsResult) { @@ -2826,10 +2834,11 @@ void PPCIntrinsicLibrary::genMmaIntr(llvm::ArrayRef args) { mlir::Value destPtr{fir::getBase(args[0])}; mlir::Type callResultPtrType{builder.getRefType(callResult.getType())}; if (destPtr.getType() != callResultPtrType) { - destPtr = builder.create(loc, callResultPtrType, destPtr); + destPtr = + fir::ConvertOp::create(builder, loc, callResultPtrType, destPtr); } // Copy the result. 
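genVecXlds above loads a single 64-bit element from the effective address and splats it across both lanes of a 2 x i64 vector before the optional bitcast. A trivially small scalar sketch of the load-and-splat:

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t storage = 42;
      const int64_t *addr = &storage;             // effective address
      std::array<int64_t, 2> splat{*addr, *addr}; // splat into both lanes
      assert(splat[0] == 42 && splat[1] == 42);
    }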
- builder.create(loc, callResult, destPtr); + fir::StoreOp::create(builder, loc, callResult, destPtr); } } @@ -2896,7 +2905,7 @@ void PPCIntrinsicLibrary::genVecStore(llvm::ArrayRef args) { if (vop == VecOp::Stxvp) { biArgs.push_back(argBases[0]); biArgs.push_back(addr); - builder.create(loc, funcOp, biArgs); + fir::CallOp::create(builder, loc, funcOp, biArgs); return; } @@ -2906,7 +2915,7 @@ void PPCIntrinsicLibrary::genVecStore(llvm::ArrayRef args) { mlir::Value newArg1{nullptr}; if (stTy != arg1TyInfo.toMlirVectorType(context)) - newArg1 = builder.create(loc, stTy, cnv); + newArg1 = mlir::vector::BitCastOp::create(builder, loc, stTy, cnv); else newArg1 = cnv; @@ -2917,7 +2926,7 @@ void PPCIntrinsicLibrary::genVecStore(llvm::ArrayRef args) { biArgs.push_back(newArg1); biArgs.push_back(addr); - builder.create(loc, funcOp, biArgs); + fir::CallOp::create(builder, loc, funcOp, biArgs); } // VEC_XST, VEC_XST_BE, VEC_STXV, VEC_XSTD2, VEC_XSTW4 @@ -2966,7 +2975,7 @@ void PPCIntrinsicLibrary::genVecXStore( mlir::Type srcTy{nullptr}; if (numElem != arg1TyInfo.len) { - cnv = builder.create(loc, mlirVecTy, cnv); + cnv = mlir::vector::BitCastOp::create(builder, loc, mlirVecTy, cnv); srcTy = firVecTy; } else { srcTy = arg1TyInfo.toFirVectorType(); @@ -2989,9 +2998,9 @@ void PPCIntrinsicLibrary::genVecXStore( default: assert(false && "Invalid vector operation for generator"); } - builder.create(loc, mlir::TypeRange{}, - mlir::ValueRange{src, trg}, - getAlignmentAttr(builder, 1)); + fir::StoreOp::create(builder, loc, mlir::TypeRange{}, + mlir::ValueRange{src, trg}, + getAlignmentAttr(builder, 1)); } } // namespace fir diff --git a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp index cd5f1f6d098c3..cc9f8280a172c 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp @@ -30,7 +30,7 @@ mlir::Value fir::runtime::genMoveAlloc(fir::FirOpBuilder &builder, mlir::dyn_cast(fir::dyn_cast_ptrEleTy(from.getType())); mlir::Type derivedType = fir::unwrapInnerType(clTy.getEleTy()); declaredTypeDesc = - builder.create(loc, mlir::TypeAttr::get(derivedType)); + fir::TypeDescOp::create(builder, loc, mlir::TypeAttr::get(derivedType)); } else { declaredTypeDesc = builder.createNullConstant(loc); } @@ -38,7 +38,7 @@ mlir::Value fir::runtime::genMoveAlloc(fir::FirOpBuilder &builder, builder, loc, fTy, to, from, declaredTypeDesc, hasStat, errMsg, sourceFile, sourceLine)}; - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } void fir::runtime::genAllocatableApplyMold(fir::FirOpBuilder &builder, @@ -52,7 +52,7 @@ void fir::runtime::genAllocatableApplyMold(fir::FirOpBuilder &builder, builder.createIntegerConstant(loc, fTy.getInput(2), rank); llvm::SmallVector args{ fir::runtime::createArguments(builder, loc, fTy, desc, mold, rankVal)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genAllocatableSetBounds(fir::FirOpBuilder &builder, @@ -66,7 +66,7 @@ void fir::runtime::genAllocatableSetBounds(fir::FirOpBuilder &builder, mlir::FunctionType fTy{func.getFunctionType()}; llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, desc, dimIndex, lowerBound, upperBound)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder, @@ -84,10 +84,10 @@ void 
fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder, hasStat = builder.createBool(loc, false); if (!errMsg) { mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); - errMsg = builder.create(loc, boxNoneTy).getResult(); + errMsg = fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); } llvm::SmallVector args{ fir::runtime::createArguments(builder, loc, fTy, desc, asyncObject, hasStat, errMsg, sourceFile, sourceLine)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/ArrayConstructor.cpp b/flang/lib/Optimizer/Builder/Runtime/ArrayConstructor.cpp index 0d56cd2edc99b..8c9825efaaa4e 100644 --- a/flang/lib/Optimizer/Builder/Runtime/ArrayConstructor.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/ArrayConstructor.cpp @@ -50,7 +50,7 @@ mlir::Value fir::runtime::genInitArrayConstructorVector( auto args = fir::runtime::createArguments(builder, loc, funcType, cookie, toBox, useValueLengthParameters, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); return cookie; } @@ -63,7 +63,7 @@ void fir::runtime::genPushArrayConstructorValue( mlir::FunctionType funcType = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcType, arrayConstructorVector, fromBox); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genPushArrayConstructorSimpleScalar( @@ -75,5 +75,5 @@ void fir::runtime::genPushArrayConstructorSimpleScalar( mlir::FunctionType funcType = func.getFunctionType(); auto args = fir::runtime::createArguments( builder, loc, funcType, arrayConstructorVector, fromAddress); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Assign.cpp b/flang/lib/Optimizer/Builder/Runtime/Assign.cpp index 62f03f7d48665..336dbdc89c04a 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Assign.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Assign.cpp @@ -22,7 +22,7 @@ void fir::runtime::genAssign(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, destBox, sourceBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genAssignPolymorphic(fir::FirOpBuilder &builder, @@ -36,7 +36,7 @@ void fir::runtime::genAssignPolymorphic(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, destBox, sourceBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genAssignExplicitLengthCharacter(fir::FirOpBuilder &builder, @@ -52,7 +52,7 @@ void fir::runtime::genAssignExplicitLengthCharacter(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, destBox, sourceBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genAssignTemporary(fir::FirOpBuilder &builder, @@ -66,7 +66,7 @@ void fir::runtime::genAssignTemporary(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, destBox, 
sourceBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genCopyInAssign(fir::FirOpBuilder &builder, @@ -79,7 +79,7 @@ void fir::runtime::genCopyInAssign(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, destBox, sourceBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genCopyOutAssign(fir::FirOpBuilder &builder, @@ -93,5 +93,5 @@ void fir::runtime::genCopyOutAssign(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, destBox, sourceBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp index 62a0652cc2e5d..a6ee98685f3c9 100644 --- a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp @@ -30,7 +30,7 @@ void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, hostPtr, sourceFile, sourceLine)}; - builder.create(loc, callee, args); + fir::CallOp::create(builder, loc, callee, args); } void fir::runtime::cuda::genDescriptorCheckSection(fir::FirOpBuilder &builder, @@ -45,7 +45,7 @@ void fir::runtime::cuda::genDescriptorCheckSection(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, desc, sourceFile, sourceLine)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::cuda::genSetAllocatorIndex(fir::FirOpBuilder &builder, @@ -60,5 +60,5 @@ void fir::runtime::cuda::genSetAllocatorIndex(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, desc, index, sourceFile, sourceLine)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Character.cpp b/flang/lib/Optimizer/Builder/Runtime/Character.cpp index b16819915d5ab..57fb0cccf6863 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Character.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Character.cpp @@ -34,7 +34,7 @@ static void genCharacterSearch(FN func, fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, string1Box, string2Box, backBox, kind, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Helper function to recover the KIND from the FIR type. 
@@ -72,7 +72,7 @@ static void genAdjust(fir::FirOpBuilder &builder, mlir::Location loc, auto sourceFile = fir::factory::locationToFilename(builder, loc); auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, stringBox, sourceFile, sourceLine); - builder.create(loc, adjustFunc, args); + fir::CallOp::create(builder, loc, adjustFunc, args); } void fir::runtime::genAdjustL(fir::FirOpBuilder &builder, mlir::Location loc, @@ -114,9 +114,9 @@ fir::runtime::genCharCompare(fir::FirOpBuilder &builder, mlir::Location loc, auto fTy = beginFunc.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, lhsBuff, rhsBuff, lhsLen, rhsLen); - auto tri = builder.create(loc, beginFunc, args).getResult(0); + auto tri = fir::CallOp::create(builder, loc, beginFunc, args).getResult(0); auto zero = builder.createIntegerConstant(loc, tri.getType(), 0); - return builder.create(loc, cmp, tri, zero); + return mlir::arith::CmpIOp::create(builder, loc, cmp, tri, zero); } mlir::Value fir::runtime::genCharCompare(fir::FirOpBuilder &builder, @@ -130,8 +130,8 @@ mlir::Value fir::runtime::genCharCompare(fir::FirOpBuilder &builder, if (fir::isa_ref_type(base.getType())) return base; auto mem = - builder.create(loc, base.getType(), /*pinned=*/false); - builder.create(loc, base, mem); + fir::AllocaOp::create(builder, loc, base.getType(), /*pinned=*/false); + fir::StoreOp::create(builder, loc, base, mem); return mem; }; auto lhsBuffer = allocateIfNotInMemory(fir::getBase(lhs)); @@ -165,7 +165,7 @@ mlir::Value fir::runtime::genIndex(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, stringBase, stringLen, substringBase, substringLen, back); - return builder.create(loc, indexFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, indexFunc, args).getResult(0); } void fir::runtime::genIndexDescriptor(fir::FirOpBuilder &builder, @@ -189,7 +189,7 @@ void fir::runtime::genRepeat(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, stringBox, ncopies, sourceFile, sourceLine); - builder.create(loc, repeatFunc, args); + fir::CallOp::create(builder, loc, repeatFunc, args); } void fir::runtime::genTrim(fir::FirOpBuilder &builder, mlir::Location loc, @@ -202,7 +202,7 @@ void fir::runtime::genTrim(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, stringBox, sourceFile, sourceLine); - builder.create(loc, trimFunc, args); + fir::CallOp::create(builder, loc, trimFunc, args); } void fir::runtime::genScanDescriptor(fir::FirOpBuilder &builder, @@ -237,7 +237,7 @@ mlir::Value fir::runtime::genScan(fir::FirOpBuilder &builder, auto fTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, stringBase, stringLen, setBase, setLen, back); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } void fir::runtime::genVerifyDescriptor(fir::FirOpBuilder &builder, @@ -274,5 +274,5 @@ mlir::Value fir::runtime::genVerify(fir::FirOpBuilder &builder, auto fTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, stringBase, stringLen, setBase, setLen, back); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Command.cpp b/flang/lib/Optimizer/Builder/Runtime/Command.cpp index 
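genCharCompare above calls a runtime entry point that returns a three-way comparison value and then applies the requested predicate against zero with arith::CmpIOp. A scalar sketch of that shape, with strcmp standing in for the character-compare runtime call (an assumption for illustration, not the real entry point):

    #include <cassert>
    #include <cstring>
    #include <functional>

    static bool charCompare(const char *lhs, const char *rhs,
                            const std::function<bool(int, int)> &pred) {
      int tri = std::strcmp(lhs, rhs); // three-way result
      return pred(tri, 0);             // predicate applied against zero
    }

    int main() {
      assert(charCompare("abc", "abd", std::less<int>()));
      assert(charCompare("abc", "abc", std::equal_to<int>()));
    }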
35aa529a9a727..e65e2b6df7557 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Command.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Command.cpp @@ -30,7 +30,7 @@ mlir::Value fir::runtime::genCommandArgumentCount(fir::FirOpBuilder &builder, mlir::Location loc) { auto argumentCountFunc = fir::runtime::getRuntimeFunc(loc, builder); - return builder.create(loc, argumentCountFunc).getResult(0); + return fir::CallOp::create(builder, loc, argumentCountFunc).getResult(0); } mlir::Value fir::runtime::genGetCommand(fir::FirOpBuilder &builder, @@ -46,7 +46,7 @@ mlir::Value fir::runtime::genGetCommand(fir::FirOpBuilder &builder, llvm::SmallVector args = fir::runtime::createArguments(builder, loc, runtimeFuncTy, command, length, errmsg, sourceFile, sourceLine); - return builder.create(loc, runtimeFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0); } mlir::Value fir::runtime::genGetPID(fir::FirOpBuilder &builder, @@ -54,7 +54,7 @@ mlir::Value fir::runtime::genGetPID(fir::FirOpBuilder &builder, auto runtimeFunc = fir::runtime::getRuntimeFunc(loc, builder); - return builder.create(loc, runtimeFunc).getResult(0); + return fir::CallOp::create(builder, loc, runtimeFunc).getResult(0); } mlir::Value fir::runtime::genGetCommandArgument( @@ -69,7 +69,7 @@ mlir::Value fir::runtime::genGetCommandArgument( llvm::SmallVector args = fir::runtime::createArguments(builder, loc, runtimeFuncTy, number, value, length, errmsg, sourceFile, sourceLine); - return builder.create(loc, runtimeFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0); } mlir::Value fir::runtime::genGetEnvVariable(fir::FirOpBuilder &builder, @@ -87,7 +87,7 @@ mlir::Value fir::runtime::genGetEnvVariable(fir::FirOpBuilder &builder, llvm::SmallVector args = fir::runtime::createArguments( builder, loc, runtimeFuncTy, name, value, length, trimName, errmsg, sourceFile, sourceLine); - return builder.create(loc, runtimeFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0); } mlir::Value fir::runtime::genGetCwd(fir::FirOpBuilder &builder, @@ -100,7 +100,7 @@ mlir::Value fir::runtime::genGetCwd(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(2)); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, runtimeFuncTy, cwd, sourceFile, sourceLine); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } mlir::Value fir::runtime::genHostnm(fir::FirOpBuilder &builder, @@ -113,7 +113,7 @@ mlir::Value fir::runtime::genHostnm(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(2)); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, runtimeFuncTy, res, sourceFile, sourceLine); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } void fir::runtime::genPerror(fir::FirOpBuilder &builder, mlir::Location loc, @@ -123,7 +123,7 @@ void fir::runtime::genPerror(fir::FirOpBuilder &builder, mlir::Location loc, mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType(); llvm::SmallVector args = fir::runtime::createArguments(builder, loc, runtimeFuncTy, string); - builder.create(loc, runtimeFunc, args); + fir::CallOp::create(builder, loc, runtimeFunc, args); } mlir::Value fir::runtime::genPutEnv(fir::FirOpBuilder &builder, @@ -137,7 +137,7 @@ mlir::Value 
fir::runtime::genPutEnv(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(1)); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, runtimeFuncTy, str, strLength, sourceFile, sourceLine); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } mlir::Value fir::runtime::genUnlink(fir::FirOpBuilder &builder, @@ -151,5 +151,5 @@ mlir::Value fir::runtime::genUnlink(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(1)); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, runtimeFuncTy, path, pathLength, sourceFile, sourceLine); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Derived.cpp b/flang/lib/Optimizer/Builder/Runtime/Derived.cpp index 25b41518a90e5..1b0457bb2d0a1 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Derived.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Derived.cpp @@ -26,7 +26,7 @@ void fir::runtime::genDerivedTypeInitialize(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); auto args = fir::runtime::createArguments(builder, loc, fTy, box, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genDerivedTypeInitializeClone(fir::FirOpBuilder &builder, @@ -41,7 +41,7 @@ void fir::runtime::genDerivedTypeInitializeClone(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, newBox, box, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genDerivedTypeDestroy(fir::FirOpBuilder &builder, @@ -49,7 +49,7 @@ void fir::runtime::genDerivedTypeDestroy(fir::FirOpBuilder &builder, auto func = fir::runtime::getRuntimeFunc(loc, builder); auto fTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, box); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genDerivedTypeFinalize(fir::FirOpBuilder &builder, @@ -61,7 +61,7 @@ void fir::runtime::genDerivedTypeFinalize(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); auto args = fir::runtime::createArguments(builder, loc, fTy, box, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genDerivedTypeDestroyWithoutFinalization( @@ -70,7 +70,7 @@ void fir::runtime::genDerivedTypeDestroyWithoutFinalization( loc, builder); auto fTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, box); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genNullifyDerivedType(fir::FirOpBuilder &builder, @@ -78,7 +78,7 @@ void fir::runtime::genNullifyDerivedType(fir::FirOpBuilder &builder, fir::RecordType derivedType, unsigned rank) { mlir::Value typeDesc = - builder.create(loc, mlir::TypeAttr::get(derivedType)); + fir::TypeDescOp::create(builder, loc, mlir::TypeAttr::get(derivedType)); mlir::func::FuncOp callee = fir::runtime::getRuntimeFunc(loc, builder); @@ -90,7 +90,7 @@ void fir::runtime::genNullifyDerivedType(fir::FirOpBuilder &builder, mlir::Value c0 = 
builder.createIntegerConstant(loc, inputTypes[3], 0); args.push_back(rankCst); args.push_back(c0); - builder.create(loc, callee, args); + fir::CallOp::create(builder, loc, callee, args); } mlir::Value fir::runtime::genSameTypeAs(fir::FirOpBuilder &builder, @@ -100,7 +100,7 @@ mlir::Value fir::runtime::genSameTypeAs(fir::FirOpBuilder &builder, fir::runtime::getRuntimeFunc(loc, builder); auto fTy = sameTypeAsFunc.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, a, b); - return builder.create(loc, sameTypeAsFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, sameTypeAsFunc, args).getResult(0); } mlir::Value fir::runtime::genExtendsTypeOf(fir::FirOpBuilder &builder, @@ -110,5 +110,6 @@ mlir::Value fir::runtime::genExtendsTypeOf(fir::FirOpBuilder &builder, fir::runtime::getRuntimeFunc(loc, builder); auto fTy = extendsTypeOfFunc.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, a, mold); - return builder.create(loc, extendsTypeOfFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, extendsTypeOfFunc, args) + .getResult(0); } diff --git a/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp b/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp index bf5fd6af0eafa..fa3d00e8b844f 100755 --- a/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp @@ -44,7 +44,7 @@ mlir::Value fir::runtime::genEnvironmentDefaults( mlir::IntegerAttr one = builder.getIntegerAttr(idxTy, 1); std::string itemListName = envDefaultListPtrName + ".items"; auto listBuilder = [&](fir::FirOpBuilder &builder) { - mlir::Value list = builder.create(loc, itemListTy); + mlir::Value list = fir::UndefOp::create(builder, loc, itemListTy); llvm::SmallVector idx = {mlir::Attribute{}, mlir::Attribute{}}; auto insertStringField = [&](const std::string &s, @@ -52,8 +52,8 @@ mlir::Value fir::runtime::genEnvironmentDefaults( mlir::Value stringAddress = fir::getBase( fir::factory::createStringLiteral(builder, loc, s + '\0')); mlir::Value addr = builder.createConvert(loc, charRefTy, stringAddress); - return builder.create(loc, itemListTy, list, addr, - builder.getArrayAttr(idx)); + return fir::InsertValueOp::create(builder, loc, itemListTy, list, addr, + builder.getArrayAttr(idx)); }; size_t n = 0; @@ -65,7 +65,7 @@ mlir::Value fir::runtime::genEnvironmentDefaults( list = insertStringField(def.defaultValue, idx); ++n; } - builder.create(loc, list); + fir::HasValueOp::create(builder, loc, list); }; builder.createGlobalConstant(loc, itemListTy, itemListName, listBuilder, linkOnce); @@ -73,27 +73,27 @@ mlir::Value fir::runtime::genEnvironmentDefaults( // Define the EnviornmentDefaultList object. 
auto envDefaultListBuilder = [&](fir::FirOpBuilder &builder) { mlir::Value envDefaultList = - builder.create(loc, envDefaultListTy); + fir::UndefOp::create(builder, loc, envDefaultListTy); mlir::Value numItems = builder.createIntegerConstant(loc, intTy, envDefaults.size()); - envDefaultList = builder.create( - loc, envDefaultListTy, envDefaultList, numItems, - builder.getArrayAttr(zero)); + envDefaultList = fir::InsertValueOp::create(builder, loc, envDefaultListTy, + envDefaultList, numItems, + builder.getArrayAttr(zero)); fir::GlobalOp itemList = builder.getNamedGlobal(itemListName); assert(itemList && "missing environment default list"); - mlir::Value listAddr = builder.create( - loc, itemList.resultType(), itemList.getSymbol()); - envDefaultList = builder.create( - loc, envDefaultListTy, envDefaultList, listAddr, - builder.getArrayAttr(one)); - builder.create(loc, envDefaultList); + mlir::Value listAddr = fir::AddrOfOp::create( + builder, loc, itemList.resultType(), itemList.getSymbol()); + envDefaultList = fir::InsertValueOp::create(builder, loc, envDefaultListTy, + envDefaultList, listAddr, + builder.getArrayAttr(one)); + fir::HasValueOp::create(builder, loc, envDefaultList); }; fir::GlobalOp envDefaultList = builder.createGlobalConstant( loc, envDefaultListTy, envDefaultListPtrName + ".list", envDefaultListBuilder, linkOnce); // Define the pointer to the list used by the runtime. - mlir::Value addr = builder.create( - loc, envDefaultList.resultType(), envDefaultList.getSymbol()); + mlir::Value addr = fir::AddrOfOp::create( + builder, loc, envDefaultList.resultType(), envDefaultList.getSymbol()); return addr; } diff --git a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp index 0f66315696ac7..0256644ed06e7 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp @@ -18,21 +18,21 @@ mlir::Value fir::runtime::genMapExcept(fir::FirOpBuilder &builder, mlir::Value excepts) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - return builder.create(loc, func, excepts).getResult(0); + return fir::CallOp::create(builder, loc, func, excepts).getResult(0); } void fir::runtime::genFeclearexcept(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value excepts) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - builder.create(loc, func, excepts); + fir::CallOp::create(builder, loc, func, excepts); } void fir::runtime::genFeraiseexcept(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value excepts) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - builder.create(loc, func, excepts); + fir::CallOp::create(builder, loc, func, excepts); } mlir::Value fir::runtime::genFetestexcept(fir::FirOpBuilder &builder, @@ -40,28 +40,28 @@ mlir::Value fir::runtime::genFetestexcept(fir::FirOpBuilder &builder, mlir::Value excepts) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - return builder.create(loc, func, excepts).getResult(0); + return fir::CallOp::create(builder, loc, func, excepts).getResult(0); } void fir::runtime::genFedisableexcept(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value excepts) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - builder.create(loc, func, excepts); + fir::CallOp::create(builder, loc, func, excepts); } void fir::runtime::genFeenableexcept(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value excepts) { mlir::func::FuncOp 
func{ fir::runtime::getRuntimeFunc(loc, builder)}; - builder.create(loc, func, excepts); + fir::CallOp::create(builder, loc, func, excepts); } mlir::Value fir::runtime::genFegetexcept(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - return builder.create(loc, func).getResult(0); + return fir::CallOp::create(builder, loc, func).getResult(0); } mlir::Value fir::runtime::genSupportHalting(fir::FirOpBuilder &builder, @@ -69,33 +69,33 @@ mlir::Value fir::runtime::genSupportHalting(fir::FirOpBuilder &builder, mlir::Value excepts) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - return builder.create(loc, func, excepts).getResult(0); + return fir::CallOp::create(builder, loc, func, excepts).getResult(0); } mlir::Value fir::runtime::genGetUnderflowMode(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - return builder.create(loc, func).getResult(0); + return fir::CallOp::create(builder, loc, func).getResult(0); } void fir::runtime::genSetUnderflowMode(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value flag) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - builder.create(loc, func, flag); + fir::CallOp::create(builder, loc, func, flag); } mlir::Value fir::runtime::genGetModesTypeSize(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - return builder.create(loc, func).getResult(0); + return fir::CallOp::create(builder, loc, func).getResult(0); } mlir::Value fir::runtime::genGetStatusTypeSize(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - return builder.create(loc, func).getResult(0); + return fir::CallOp::create(builder, loc, func).getResult(0); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Execute.cpp b/flang/lib/Optimizer/Builder/Runtime/Execute.cpp index 71ee3996ac0da..2f85fb45ea0d2 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Execute.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Execute.cpp @@ -40,5 +40,5 @@ void fir::runtime::genExecuteCommandLine(fir::FirOpBuilder &builder, llvm::SmallVector args = fir::runtime::createArguments( builder, loc, runtimeFuncTy, command, wait, exitstat, cmdstat, cmdmsg, sourceFile, sourceLine); - builder.create(loc, runtimeFunc, args); + fir::CallOp::create(builder, loc, runtimeFunc, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp index 718c3533564e8..5e3f022310973 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp @@ -26,7 +26,7 @@ mlir::Value fir::runtime::genLboundDim(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, array, dim, sourceFile, sourceLine); - return builder.create(loc, lboundFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, lboundFunc, args).getResult(0); } void fir::runtime::genLbound(fir::FirOpBuilder &builder, mlir::Location loc, @@ -40,7 +40,7 @@ void fir::runtime::genLbound(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); auto args = fir::runtime::createArguments( builder, loc, fTy, resultAddr, array, kind, sourceFile, sourceLine); - builder.create(loc, func, args); + 
fir::CallOp::create(builder, loc, func, args); } /// Generate call to `Ubound` runtime routine. Calls to UBOUND with a DIM @@ -57,7 +57,7 @@ void fir::runtime::genUbound(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, array, kind, sourceFile, sourceLine); - builder.create(loc, uboundFunc, args); + fir::CallOp::create(builder, loc, uboundFunc, args); } /// Generate call to `Size` runtime routine. This routine is a version when @@ -73,7 +73,7 @@ mlir::Value fir::runtime::genSizeDim(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, array, dim, sourceFile, sourceLine); - return builder.create(loc, sizeFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, sizeFunc, args).getResult(0); } /// Generate call to `Size` runtime routine. This routine is a version when @@ -88,7 +88,7 @@ mlir::Value fir::runtime::genSize(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); auto args = fir::runtime::createArguments(builder, loc, fTy, array, sourceFile, sourceLine); - return builder.create(loc, sizeFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, sizeFunc, args).getResult(0); } /// Generate call to `IsContiguous` runtime routine. @@ -99,7 +99,7 @@ mlir::Value fir::runtime::genIsContiguous(fir::FirOpBuilder &builder, fir::runtime::getRuntimeFunc(loc, builder); auto fTy = isContiguousFunc.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, array); - return builder.create(loc, isContiguousFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, isContiguousFunc, args).getResult(0); } /// Generate call to `IsContiguousUpTo` runtime routine. 
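The bulk of this patch is a mechanical rewrite from the builder-member form `builder.create<SomeOp>(loc, ...)` to the static form `SomeOp::create(builder, loc, ...)`. A minimal standalone sketch of the two call styles, using mocked `Builder`/`CallLikeOp` types rather than the real MLIR classes (names invented for illustration):

```cpp
#include <iostream>
#include <utility>

// Mock stand-ins for an MLIR-like builder and op; illustrative only, these do
// not reproduce the real mlir::OpBuilder / Op interfaces.
struct Builder;

struct CallLikeOp {
  int callee;
  // Static construction entry point: the op names itself and the builder is
  // an explicit first argument (the style the patch migrates to).
  static CallLikeOp create(Builder &b, int loc, int callee);
};

struct Builder {
  // Member-template construction (the style the patch migrates away from):
  // it simply forwards to the op's static create, so both spellings build
  // the same op.
  template <typename Op, typename... Args>
  Op create(int loc, Args &&...args) {
    return Op::create(*this, loc, std::forward<Args>(args)...);
  }
};

CallLikeOp CallLikeOp::create(Builder &, int loc, int callee) {
  std::cout << "created call at loc " << loc << " to callee " << callee << "\n";
  return CallLikeOp{callee};
}

int main() {
  Builder builder;
  int loc = 42;
  // Old spelling: the op type is a template argument on the builder.
  CallLikeOp a = builder.create<CallLikeOp>(loc, /*callee=*/1);
  // New spelling: the op's own static create takes the builder explicitly.
  CallLikeOp b = CallLikeOp::create(builder, loc, /*callee=*/2);
  return a.callee + b.callee == 3 ? 0 : 1;
}
```

The observable result is identical; the static form just makes the op type the syntactic subject at every call site.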
@@ -111,7 +111,7 @@ mlir::Value fir::runtime::genIsContiguousUpTo(fir::FirOpBuilder &builder, fir::runtime::getRuntimeFunc(loc, builder); auto fTy = isContiguousFunc.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, array, dim); - return builder.create(loc, isContiguousFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, isContiguousFunc, args).getResult(0); } void fir::runtime::genShape(fir::FirOpBuilder &builder, mlir::Location loc, @@ -125,5 +125,5 @@ void fir::runtime::genShape(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); auto args = fir::runtime::createArguments( builder, loc, fTy, resultAddr, array, kind, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index 773d6408079cc..4b4954a3e738c 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -52,14 +52,15 @@ mlir::Value fir::runtime::genAssociated(fir::FirOpBuilder &builder, builder); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, func.getFunctionType(), pointer, target); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } mlir::Value fir::runtime::genCpuTime(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::func::FuncOp func = fir::runtime::getRuntimeFunc(loc, builder); - return builder.create(loc, func, std::nullopt).getResult(0); + return fir::CallOp::create(builder, loc, func, mlir::ValueRange{}) + .getResult(0); } void fir::runtime::genDateAndTime(fir::FirOpBuilder &builder, @@ -102,7 +103,7 @@ void fir::runtime::genDateAndTime(fir::FirOpBuilder &builder, llvm::SmallVector args = fir::runtime::createArguments( builder, loc, funcTy, dateBuffer, dateLen, timeBuffer, timeLen, zoneBuffer, zoneLen, sourceFile, sourceLine, values); - builder.create(loc, callee, args); + fir::CallOp::create(builder, loc, callee, args); } void fir::runtime::genEtime(fir::FirOpBuilder &builder, mlir::Location loc, @@ -116,7 +117,7 @@ void fir::runtime::genEtime(fir::FirOpBuilder &builder, mlir::Location loc, llvm::SmallVector args = fir::runtime::createArguments( builder, loc, runtimeFuncTy, values, time, sourceFile, sourceLine); - builder.create(loc, runtimeFunc, args); + fir::CallOp::create(builder, loc, runtimeFunc, args); } void fir::runtime::genFree(fir::FirOpBuilder &builder, mlir::Location loc, @@ -124,8 +125,8 @@ void fir::runtime::genFree(fir::FirOpBuilder &builder, mlir::Location loc, auto runtimeFunc = fir::runtime::getRuntimeFunc(loc, builder); mlir::Type intPtrTy = builder.getIntPtrType(); - builder.create(loc, runtimeFunc, - builder.createConvert(loc, intPtrTy, ptr)); + fir::CallOp::create(builder, loc, runtimeFunc, + builder.createConvert(loc, intPtrTy, ptr)); } mlir::Value fir::runtime::genFseek(fir::FirOpBuilder &builder, @@ -139,7 +140,7 @@ mlir::Value fir::runtime::genFseek(fir::FirOpBuilder &builder, llvm::SmallVector args = fir::runtime::createArguments(builder, loc, runtimeFuncTy, unit, offset, whence, sourceFile, sourceLine); - return builder.create(loc, runtimeFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0); ; } @@ -149,7 +150,7 @@ mlir::Value fir::runtime::genFtell(fir::FirOpBuilder &builder, mlir::FunctionType 
runtimeFuncTy = runtimeFunc.getFunctionType(); llvm::SmallVector args = fir::runtime::createArguments(builder, loc, runtimeFuncTy, unit); - return builder.create(loc, runtimeFunc, args).getResult(0); + return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0); } mlir::Value fir::runtime::genGetGID(fir::FirOpBuilder &builder, @@ -157,7 +158,7 @@ mlir::Value fir::runtime::genGetGID(fir::FirOpBuilder &builder, auto runtimeFunc = fir::runtime::getRuntimeFunc(loc, builder); - return builder.create(loc, runtimeFunc).getResult(0); + return fir::CallOp::create(builder, loc, runtimeFunc).getResult(0); } mlir::Value fir::runtime::genGetUID(fir::FirOpBuilder &builder, @@ -165,7 +166,7 @@ mlir::Value fir::runtime::genGetUID(fir::FirOpBuilder &builder, auto runtimeFunc = fir::runtime::getRuntimeFunc(loc, builder); - return builder.create(loc, runtimeFunc).getResult(0); + return fir::CallOp::create(builder, loc, runtimeFunc).getResult(0); } mlir::Value fir::runtime::genMalloc(fir::FirOpBuilder &builder, @@ -186,7 +187,7 @@ void fir::runtime::genRandomInit(fir::FirOpBuilder &builder, mlir::Location loc, fir::runtime::getRuntimeFunc(loc, builder); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, func.getFunctionType(), repeatable, imageDistinct); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genRandomNumber(fir::FirOpBuilder &builder, @@ -206,7 +207,7 @@ void fir::runtime::genRandomNumber(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, funcTy.getInput(2)); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, funcTy, harvest, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genRandomSeed(fir::FirOpBuilder &builder, mlir::Location loc, @@ -223,7 +224,7 @@ void fir::runtime::genRandomSeed(fir::FirOpBuilder &builder, mlir::Location loc, if (staticArgCount == 0) { func = fir::runtime::getRuntimeFunc(loc, builder); - builder.create(loc, func); + fir::CallOp::create(builder, loc, func); return; } mlir::FunctionType funcTy; @@ -238,7 +239,7 @@ void fir::runtime::genRandomSeed(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, funcTy.getInput(4)); args = fir::runtime::createArguments(builder, loc, funcTy, size, put, get, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); return; } if (sizeIsPresent) { @@ -255,7 +256,7 @@ void fir::runtime::genRandomSeed(fir::FirOpBuilder &builder, mlir::Location loc, sourceLine = fir::factory::locationToLineNo(builder, loc, funcTy.getInput(2)); args = fir::runtime::createArguments(builder, loc, funcTy, argBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// generate rename runtime call @@ -273,14 +274,15 @@ void fir::runtime::genRename(fir::FirOpBuilder &builder, mlir::Location loc, llvm::SmallVector args = fir::runtime::createArguments(builder, loc, runtimeFuncTy, path1, path2, status, sourceFile, sourceLine); - builder.create(loc, runtimeFunc, args); + fir::CallOp::create(builder, loc, runtimeFunc, args); } /// generate runtime call to time intrinsic mlir::Value fir::runtime::genTime(fir::FirOpBuilder &builder, mlir::Location loc) { auto func = fir::runtime::getRuntimeFunc(loc, builder); - return builder.create(loc, func, std::nullopt).getResult(0); + return fir::CallOp::create(builder, loc, func, 
mlir::ValueRange{}) + .getResult(0); } /// generate runtime call to transfer intrinsic with no size argument @@ -296,7 +298,7 @@ void fir::runtime::genTransfer(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, fTy, resultBox, sourceBox, moldBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// generate runtime call to transfer intrinsic with size argument @@ -313,7 +315,7 @@ void fir::runtime::genTransferSize(fir::FirOpBuilder &builder, llvm::SmallVector args = fir::runtime::createArguments(builder, loc, fTy, resultBox, sourceBox, moldBox, sourceFile, sourceLine, size); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// generate system_clock runtime call/s @@ -330,11 +332,12 @@ void fir::runtime::genSystemClock(fir::FirOpBuilder &builder, mlir::dyn_cast(type)) { // Check for a disassociated pointer or an unallocated allocatable. assert(!isOptionalArg && "invalid optional argument"); - ifOp = builder.create(loc, builder.genIsNotNullAddr(loc, arg), - /*withElseRegion=*/false); + ifOp = fir::IfOp::create(builder, loc, builder.genIsNotNullAddr(loc, arg), + /*withElseRegion=*/false); } else if (isOptionalArg) { - ifOp = builder.create( - loc, builder.create(loc, builder.getI1Type(), arg), + ifOp = fir::IfOp::create( + builder, loc, + fir::IsPresentOp::create(builder, loc, builder.getI1Type(), arg), /*withElseRegion=*/false); } if (ifOp) @@ -346,11 +349,11 @@ void fir::runtime::genSystemClock(fir::FirOpBuilder &builder, integerKind = intType.getWidth() / 8; mlir::Value kind = builder.createIntegerConstant(loc, kindTy, integerKind); mlir::Value res = - builder.create(loc, func, mlir::ValueRange{kind}) + fir::CallOp::create(builder, loc, func, mlir::ValueRange{kind}) .getResult(0); mlir::Value castRes = builder.createConvert(loc, fir::dyn_cast_ptrEleTy(type), res); - builder.create(loc, castRes, arg); + fir::StoreOp::create(builder, loc, castRes, arg); if (ifOp) builder.setInsertionPointAfter(ifOp); }; @@ -371,24 +374,24 @@ void fir::runtime::genSignal(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value status) { assert(mlir::isa(number.getType())); mlir::Type int64 = builder.getIntegerType(64); - number = builder.create(loc, int64, number); + number = fir::ConvertOp::create(builder, loc, int64, number); mlir::Type handlerUnwrappedTy = fir::unwrapRefType(handler.getType()); if (mlir::isa_and_nonnull(handlerUnwrappedTy)) { // pass the integer as a function pointer like one would to signal(2) - handler = builder.create(loc, handler); + handler = fir::LoadOp::create(builder, loc, handler); mlir::Type fnPtrTy = fir::LLVMPointerType::get( mlir::FunctionType::get(handler.getContext(), {}, {})); - handler = builder.create(loc, fnPtrTy, handler); + handler = fir::ConvertOp::create(builder, loc, fnPtrTy, handler); } else { assert(mlir::isa(handler.getType())); - handler = builder.create(loc, handler); + handler = fir::BoxAddrOp::create(builder, loc, handler); } mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; mlir::Value stat = - builder.create(loc, func, mlir::ValueRange{number, handler}) + fir::CallOp::create(builder, loc, func, mlir::ValueRange{number, handler}) ->getResult(0); // return status code via status argument (if present) @@ -396,12 +399,12 @@ void fir::runtime::genSignal(fir::FirOpBuilder &builder, mlir::Location loc, 
assert(mlir::isa(fir::unwrapRefType(status.getType()))); // status might be dynamically optional, so test if it is present mlir::Value isPresent = - builder.create(loc, builder.getI1Type(), status); + IsPresentOp::create(builder, loc, builder.getI1Type(), status); builder.genIfOp(loc, /*results=*/{}, isPresent, /*withElseRegion=*/false) .genThen([&]() { - stat = builder.create( - loc, fir::unwrapRefType(status.getType()), stat); - builder.create(loc, stat, status); + stat = fir::ConvertOp::create( + builder, loc, fir::unwrapRefType(status.getType()), stat); + fir::StoreOp::create(builder, loc, stat, status); }) .end(); } @@ -410,10 +413,10 @@ void fir::runtime::genSignal(fir::FirOpBuilder &builder, mlir::Location loc, void fir::runtime::genSleep(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value seconds) { mlir::Type int64 = builder.getIntegerType(64); - seconds = builder.create(loc, int64, seconds); + seconds = fir::ConvertOp::create(builder, loc, int64, seconds); mlir::func::FuncOp func{ fir::runtime::getRuntimeFunc(loc, builder)}; - builder.create(loc, func, seconds); + fir::CallOp::create(builder, loc, func, seconds); } /// generate chdir runtime call @@ -423,5 +426,5 @@ mlir::Value fir::runtime::genChdir(fir::FirOpBuilder &builder, fir::runtime::getRuntimeFunc(loc, builder)}; llvm::SmallVector args = fir::runtime::createArguments(builder, loc, func.getFunctionType(), name); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Main.cpp b/flang/lib/Optimizer/Builder/Runtime/Main.cpp index 973744837d378..d35f687167b05 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Main.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Main.cpp @@ -62,17 +62,17 @@ void fir::runtime::genMain( llvm::SmallVector args(block->getArguments()); args.push_back(env); - builder.create(loc, startFn, args); + fir::CallOp::create(builder, loc, startFn, args); if (initCuda) { auto initFn = builder.createFunction( loc, RTNAME_STRING(CUFInit), mlir::FunctionType::get(context, {}, {})); - builder.create(loc, initFn); + fir::CallOp::create(builder, loc, initFn); } - builder.create(loc, qqMainFn); - builder.create(loc, stopFn); + fir::CallOp::create(builder, loc, qqMainFn); + fir::CallOp::create(builder, loc, stopFn); mlir::Value ret = builder.createIntegerConstant(loc, argcTy, 0); - builder.create(loc, ret); + mlir::func::ReturnOp::create(builder, loc, ret); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp index 4ff7c86bb0a24..62d5e506cedf6 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp @@ -317,7 +317,7 @@ mlir::Value fir::runtime::genExponent(fir::FirOpBuilder &builder, llvm::SmallVector args = { builder.createConvert(loc, funcTy.getInput(0), x)}; - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Fraction intrinsic runtime routine. @@ -340,7 +340,7 @@ mlir::Value fir::runtime::genFraction(fir::FirOpBuilder &builder, llvm::SmallVector args = { builder.createConvert(loc, funcTy.getInput(0), x)}; - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Mod intrinsic runtime routine. 
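A few call sites in these hunks (genCpuTime and genTime above, genAbort later in Stop.cpp) stop passing `std::nullopt` where an operand range is expected and pass an explicit empty `mlir::ValueRange{}` instead. A standalone analogue of why the explicit empty range is the clearer spelling, with a mocked range type rather than the real `mlir::ValueRange`:

```cpp
#include <initializer_list>
#include <iostream>
#include <vector>

// Mock of a lightweight operand-range type; illustrative only.
struct ValueRange {
  std::vector<int> values;
  ValueRange() = default;                                  // empty range
  ValueRange(std::initializer_list<int> v) : values(v) {}  // explicit operands
};

// A call-creation helper that takes its operands as a range.
void createCall(const char *callee, const ValueRange &operands = ValueRange{}) {
  std::cout << callee << " with " << operands.values.size() << " operand(s)\n";
}

int main() {
  // An explicit empty range states the intent directly: this call takes no
  // operands. A null-like sentinel (std::nullopt in the original code) only
  // works if the range type provides a special conversion for it.
  createCall("CpuTime", ValueRange{});
  createCall("MapException", ValueRange{1});
  return 0;
}
```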
@@ -370,7 +370,7 @@ mlir::Value fir::runtime::genMod(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, funcTy, a, p, sourceFile, sourceLine); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Modulo intrinsic runtime routine. @@ -403,7 +403,7 @@ mlir::Value fir::runtime::genModulo(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, funcTy, a, p, sourceFile, sourceLine); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Nearest intrinsic or a "Next" intrinsic module procedure. @@ -427,7 +427,7 @@ mlir::Value fir::runtime::genNearest(fir::FirOpBuilder &builder, auto funcTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcTy, x, valueUp); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to RRSpacing intrinsic runtime routine. @@ -451,7 +451,7 @@ mlir::Value fir::runtime::genRRSpacing(fir::FirOpBuilder &builder, llvm::SmallVector args = { builder.createConvert(loc, funcTy.getInput(0), x)}; - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to ErfcScaled intrinsic runtime routine. @@ -475,7 +475,7 @@ mlir::Value fir::runtime::genErfcScaled(fir::FirOpBuilder &builder, llvm::SmallVector args = { builder.createConvert(loc, funcTy.getInput(0), x)}; - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Scale intrinsic runtime routine. @@ -499,7 +499,7 @@ mlir::Value fir::runtime::genScale(fir::FirOpBuilder &builder, auto funcTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcTy, x, i); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Selected_char_kind intrinsic runtime routine. @@ -519,7 +519,7 @@ mlir::Value fir::runtime::genSelectedCharKind(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, sourceFile, sourceLine, name, length); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Selected_int_kind intrinsic runtime routine. @@ -540,7 +540,7 @@ mlir::Value fir::runtime::genSelectedIntKind(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, sourceFile, sourceLine, x, xKind); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Selected_logical_kind intrinsic runtime routine. @@ -561,7 +561,7 @@ mlir::Value fir::runtime::genSelectedLogicalKind(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, sourceFile, sourceLine, x, xKind); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Selected_real_kind intrinsic runtime routine. 
@@ -593,7 +593,7 @@ mlir::Value fir::runtime::genSelectedRealKind(fir::FirOpBuilder &builder, sourceLine, precision, pKind, range, rKind, radix, dKind); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Set_exponent intrinsic runtime routine. @@ -617,7 +617,7 @@ mlir::Value fir::runtime::genSetExponent(fir::FirOpBuilder &builder, auto funcTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcTy, x, i); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to Spacing intrinsic runtime routine. @@ -649,6 +649,6 @@ mlir::Value fir::runtime::genSpacing(fir::FirOpBuilder &builder, llvm::SmallVector args = { builder.createConvert(loc, funcTy.getInput(0), x)}; - mlir::Value res = builder.create(loc, func, args).getResult(0); + mlir::Value res = fir::CallOp::create(builder, loc, func, args).getResult(0); return builder.createConvert(loc, fltTy, res); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Pointer.cpp b/flang/lib/Optimizer/Builder/Runtime/Pointer.cpp index 160c6515a7a9d..c03ff5885d91e 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Pointer.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Pointer.cpp @@ -23,5 +23,5 @@ void fir::runtime::genPointerAssociateScalar(fir::FirOpBuilder &builder, mlir::FunctionType fTy{func.getFunctionType()}; llvm::SmallVector args{ fir::runtime::createArguments(builder, loc, fTy, desc, target)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp b/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp index e5d0fb0fb27a9..e5cf96359ea38 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp @@ -34,25 +34,25 @@ void fir::runtime::genRaggedArrayAllocate(mlir::Location loc, auto eleTy = fir::unwrapSequenceType(fir::unwrapRefType(header.getType())); auto ptrTy = builder.getRefType(mlir::cast(eleTy).getType(1)); - auto ptr = builder.create(loc, ptrTy, header, one); - auto heap = builder.create(loc, ptr); + auto ptr = fir::CoordinateOp::create(builder, loc, ptrTy, header, one); + auto heap = fir::LoadOp::create(builder, loc, ptr); auto cmp = builder.genIsNullAddr(loc, heap); builder.genIfThen(loc, cmp) .genThen([&]() { auto asHeadersVal = builder.createIntegerConstant(loc, i1Ty, asHeaders); auto rankVal = builder.createIntegerConstant(loc, i64Ty, rank); - auto buff = builder.create(loc, extentTy); + auto buff = fir::AllocMemOp::create(builder, loc, extentTy); // Convert all the extents to i64 and pack them in a buffer on the heap. 
for (auto i : llvm::enumerate(extents)) { auto offset = builder.createIntegerConstant(loc, i32Ty, i.index()); auto addr = - builder.create(loc, refTy, buff, offset); + fir::CoordinateOp::create(builder, loc, refTy, buff, offset); auto castVal = builder.createConvert(loc, i64Ty, i.value()); - builder.create(loc, castVal, addr); + fir::StoreOp::create(builder, loc, castVal, addr); } auto args = fir::runtime::createArguments( builder, loc, fTy, header, asHeadersVal, rankVal, eleSize, buff); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); }) .end(); } @@ -64,5 +64,5 @@ void fir::runtime::genRaggedArrayDeallocate(mlir::Location loc, loc, builder); auto fTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, header); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp index f778b963c59ca..157d4358329ce 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp @@ -1155,7 +1155,7 @@ mlir::Value genSpecial2Args(FN func, fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); auto args = fir::runtime::createArguments(builder, loc, fTy, maskBox, sourceFile, sourceLine, dim); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate calls to reduction intrinsics such as All and Any. @@ -1171,7 +1171,7 @@ static void genReduction2Args(FN func, fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, maskBox, dim, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate calls to reduction intrinsics such as Maxval and Minval. @@ -1189,7 +1189,7 @@ static void genReduction3Args(FN func, fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, dim, sourceFile, sourceLine, maskBox); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate calls to reduction intrinsics such as Maxloc and Minloc. @@ -1206,7 +1206,7 @@ static void genReduction4Args(FN func, fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, kind, sourceFile, sourceLine, maskBox, back); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate calls to reduction intrinsics such as Maxloc and Minloc. @@ -1223,7 +1223,7 @@ genReduction5Args(FN func, fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, kind, dim, sourceFile, sourceLine, maskBox, back); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `AllDim` runtime routine. @@ -1296,7 +1296,7 @@ void fir::runtime::genCountDim(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(5)); auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, maskBox, dim, kind, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `Findloc` intrinsic runtime routine. 
This is the version @@ -1313,7 +1313,7 @@ void fir::runtime::genFindloc(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, valBox, kind, sourceFile, sourceLine, maskBox, back); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `FindlocDim` intrinsic runtime routine. This is the version @@ -1331,7 +1331,7 @@ void fir::runtime::genFindlocDim(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, arrayBox, valBox, kind, dim, sourceFile, sourceLine, maskBox, back); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `Maxloc` intrinsic runtime routine. This is the version @@ -1392,7 +1392,7 @@ mlir::Value fir::runtime::genMaxval(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments( builder, loc, fTy, arrayBox, sourceFile, sourceLine, dim, maskBox); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to `MaxvalDim` intrinsic runtime routine. This is the version @@ -1417,7 +1417,7 @@ void fir::runtime::genMaxvalChar(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, arrayBox, sourceFile, sourceLine, maskBox); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `Minloc` intrinsic runtime routine. This is the version @@ -1476,7 +1476,7 @@ void fir::runtime::genMinvalChar(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, arrayBox, sourceFile, sourceLine, maskBox); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `Minval` intrinsic runtime routine. This is the version @@ -1504,7 +1504,7 @@ mlir::Value fir::runtime::genMinval(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments( builder, loc, fTy, arrayBox, sourceFile, sourceLine, dim, maskBox); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to `Norm2Dim` intrinsic runtime routine. This is the version @@ -1527,7 +1527,7 @@ void fir::runtime::genNorm2Dim(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, arrayBox, dim, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `Norm2` intrinsic runtime routine. This is the version @@ -1558,7 +1558,7 @@ mlir::Value fir::runtime::genNorm2(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, arrayBox, sourceFile, sourceLine, dim); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to `Parity` intrinsic runtime routine. 
This routine is @@ -1604,7 +1604,7 @@ mlir::Value fir::runtime::genProduct(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, sourceFile, sourceLine, dim, maskBox); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); return resultBox; } @@ -1613,7 +1613,7 @@ mlir::Value fir::runtime::genProduct(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments( builder, loc, fTy, arrayBox, sourceFile, sourceLine, dim, maskBox); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to `DotProduct` intrinsic runtime routine. @@ -1645,7 +1645,7 @@ mlir::Value fir::runtime::genDotProduct(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, vectorABox, vectorBBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); return resultBox; } @@ -1653,7 +1653,7 @@ mlir::Value fir::runtime::genDotProduct(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, vectorABox, vectorBBox, sourceFile, sourceLine); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } /// Generate call to `SumDim` intrinsic runtime routine. This is the version /// that handles any rank array with the dim argument specified. @@ -1688,7 +1688,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, sourceFile, sourceLine, dim, maskBox); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); return resultBox; } @@ -1697,7 +1697,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments( builder, loc, fTy, arrayBox, sourceFile, sourceLine, dim, maskBox); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } // The IAll, IAny and IParity intrinsics have essentially the same @@ -1733,7 +1733,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments( \ builder, loc, fTy, arrayBox, sourceFile, sourceLine, dim, maskBox); \ \ - return builder.create(loc, func, args).getResult(0); \ + return fir::CallOp::create(builder, loc, func, args).getResult(0); \ } /// Generate call to `IAllDim` intrinsic runtime routine. This is the version @@ -1819,11 +1819,12 @@ void fir::runtime::genReduce(fir::FirOpBuilder &builder, mlir::Location loc, auto sourceFile = fir::factory::locationToFilename(builder, loc); auto sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); - auto opAddr = builder.create(loc, fTy.getInput(2), operation); + auto opAddr = + fir::BoxAddrOp::create(builder, loc, fTy.getInput(2), operation); auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, arrayBox, opAddr, sourceFile, sourceLine, dim, maskBox, identity, ordered); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `Reduce` intrinsic runtime routine. 
This is the version @@ -1864,11 +1865,12 @@ mlir::Value fir::runtime::genReduce(fir::FirOpBuilder &builder, auto sourceFile = fir::factory::locationToFilename(builder, loc); auto sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); - auto opAddr = builder.create(loc, fTy.getInput(1), operation); + auto opAddr = + fir::BoxAddrOp::create(builder, loc, fTy.getInput(1), operation); auto args = fir::runtime::createArguments(builder, loc, fTy, arrayBox, opAddr, sourceFile, sourceLine, dim, maskBox, identity, ordered); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } void fir::runtime::genReduceDim(fir::FirOpBuilder &builder, mlir::Location loc, @@ -1912,9 +1914,10 @@ void fir::runtime::genReduceDim(fir::FirOpBuilder &builder, mlir::Location loc, auto sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); - auto opAddr = builder.create(loc, fTy.getInput(2), operation); + auto opAddr = + fir::BoxAddrOp::create(builder, loc, fTy.getInput(2), operation); auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, arrayBox, opAddr, sourceFile, sourceLine, dim, maskBox, identity, ordered); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Stop.cpp b/flang/lib/Optimizer/Builder/Runtime/Stop.cpp index 411181cc6dd1c..5629371947641 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Stop.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Stop.cpp @@ -19,13 +19,13 @@ void fir::runtime::genExit(fir::FirOpBuilder &builder, mlir::Location loc, auto exitFunc = fir::runtime::getRuntimeFunc(loc, builder); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, exitFunc.getFunctionType(), status); - builder.create(loc, exitFunc, args); + fir::CallOp::create(builder, loc, exitFunc, args); } void fir::runtime::genAbort(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::func::FuncOp abortFunc = fir::runtime::getRuntimeFunc(loc, builder); - builder.create(loc, abortFunc, std::nullopt); + fir::CallOp::create(builder, loc, abortFunc, mlir::ValueRange{}); } void fir::runtime::genReportFatalUserError(fir::FirOpBuilder &builder, @@ -41,5 +41,5 @@ void fir::runtime::genReportFatalUserError(fir::FirOpBuilder &builder, mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); llvm::SmallVector args = fir::runtime::createArguments( builder, loc, funcTy, msgVal, sourceFile, sourceLine); - builder.create(loc, crashFunc, args); + fir::CallOp::create(builder, loc, crashFunc, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Support.cpp b/flang/lib/Optimizer/Builder/Runtime/Support.cpp index b5e9ddb87c7c4..d0d48ad718da4 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Support.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Support.cpp @@ -42,7 +42,7 @@ void fir::runtime::genCopyAndUpdateDescriptor(fir::FirOpBuilder &builder, func.setArgAttr(0, noCapture, unitAttr); func.setArgAttr(1, noCapture, unitAttr); } - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } mlir::Value fir::runtime::genIsAssumedSize(fir::FirOpBuilder &builder, @@ -52,5 +52,5 @@ mlir::Value fir::runtime::genIsAssumedSize(fir::FirOpBuilder &builder, fir::runtime::getRuntimeFunc(loc, builder); auto fTy = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, fTy, box); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, 
loc, func, args).getResult(0); } diff --git a/flang/lib/Optimizer/Builder/Runtime/TemporaryStack.cpp b/flang/lib/Optimizer/Builder/Runtime/TemporaryStack.cpp index 732152c823a06..effd712079753 100644 --- a/flang/lib/Optimizer/Builder/Runtime/TemporaryStack.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/TemporaryStack.cpp @@ -23,7 +23,7 @@ mlir::Value fir::runtime::genCreateValueStack(mlir::Location loc, fir::factory::locationToLineNo(builder, loc, funcType.getInput(1)); auto args = fir::runtime::createArguments(builder, loc, funcType, sourceFile, sourceLine); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } void fir::runtime::genPushValue(mlir::Location loc, fir::FirOpBuilder &builder, @@ -33,7 +33,7 @@ void fir::runtime::genPushValue(mlir::Location loc, fir::FirOpBuilder &builder, mlir::FunctionType funcType = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcType, opaquePtr, boxValue); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genValueAt(mlir::Location loc, fir::FirOpBuilder &builder, @@ -44,7 +44,7 @@ void fir::runtime::genValueAt(mlir::Location loc, fir::FirOpBuilder &builder, mlir::FunctionType funcType = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcType, opaquePtr, i, retValueBox); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genDestroyValueStack(mlir::Location loc, @@ -54,7 +54,7 @@ void fir::runtime::genDestroyValueStack(mlir::Location loc, fir::runtime::getRuntimeFunc(loc, builder); mlir::FunctionType funcType = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcType, opaquePtr); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } mlir::Value fir::runtime::genCreateDescriptorStack(mlir::Location loc, @@ -68,7 +68,7 @@ mlir::Value fir::runtime::genCreateDescriptorStack(mlir::Location loc, fir::factory::locationToLineNo(builder, loc, funcType.getInput(1)); auto args = fir::runtime::createArguments(builder, loc, funcType, sourceFile, sourceLine); - return builder.create(loc, func, args).getResult(0); + return fir::CallOp::create(builder, loc, func, args).getResult(0); } void fir::runtime::genPushDescriptor(mlir::Location loc, @@ -80,7 +80,7 @@ void fir::runtime::genPushDescriptor(mlir::Location loc, mlir::FunctionType funcType = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcType, opaquePtr, boxDescriptor); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genDescriptorAt(mlir::Location loc, @@ -92,7 +92,7 @@ void fir::runtime::genDescriptorAt(mlir::Location loc, mlir::FunctionType funcType = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcType, opaquePtr, i, retDescriptorBox); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } void fir::runtime::genDestroyDescriptorStack(mlir::Location loc, @@ -103,5 +103,5 @@ void fir::runtime::genDestroyDescriptorStack(mlir::Location loc, builder); mlir::FunctionType funcType = func.getFunctionType(); auto args = fir::runtime::createArguments(builder, loc, funcType, opaquePtr); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp 
b/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp index 47744b0facb53..6251def5cfb17 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Transformational.cpp @@ -170,7 +170,7 @@ void fir::runtime::genBesselJn(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, n1, n2, x, bn2, bn2_1, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `BesselJn` intrinsic. This is used when `x == 0.0`. @@ -196,7 +196,7 @@ void fir::runtime::genBesselJnX0(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, n1, n2, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `BesselYn` intrinsic. @@ -225,7 +225,7 @@ void fir::runtime::genBesselYn(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, n1, n2, x, bn1, bn1_1, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to `BesselYn` intrinsic. This is used when `x == 0.0`. @@ -251,7 +251,7 @@ void fir::runtime::genBesselYnX0(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, n1, n2, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to Cshift intrinsic @@ -266,7 +266,7 @@ void fir::runtime::genCshift(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, shiftBox, dimBox, sourceFile, sourceLine); - builder.create(loc, cshiftFunc, args); + fir::CallOp::create(builder, loc, cshiftFunc, args); } /// Generate call to the vector version of the Cshift intrinsic @@ -282,7 +282,7 @@ void fir::runtime::genCshiftVector(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); auto args = fir::runtime::createArguments( builder, loc, fTy, resultBox, arrayBox, shiftBox, sourceFile, sourceLine); - builder.create(loc, cshiftFunc, args); + fir::CallOp::create(builder, loc, cshiftFunc, args); } /// Generate call to Eoshift intrinsic @@ -299,7 +299,7 @@ void fir::runtime::genEoshift(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, shiftBox, boundBox, dimBox, sourceFile, sourceLine); - builder.create(loc, eoshiftFunc, args); + fir::CallOp::create(builder, loc, eoshiftFunc, args); } /// Generate call to the vector version of the Eoshift intrinsic @@ -318,7 +318,7 @@ void fir::runtime::genEoshiftVector(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, shiftBox, boundBox, sourceFile, sourceLine); - builder.create(loc, eoshiftFunc, args); + fir::CallOp::create(builder, loc, eoshiftFunc, args); } /// Define ForcedMatmul models. 
@@ -388,7 +388,7 @@ void fir::runtime::genMatmul(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, matrixABox, matrixBBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Define ForcedMatmulTranspose models. @@ -440,7 +440,7 @@ void fir::runtime::genMatmulTranspose(fir::FirOpBuilder &builder, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, matrixABox, matrixBBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to Pack intrinsic runtime routine. @@ -455,7 +455,7 @@ void fir::runtime::genPack(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, maskBox, vectorBox, sourceFile, sourceLine); - builder.create(loc, packFunc, args); + fir::CallOp::create(builder, loc, packFunc, args); } /// Generate call to Reshape intrinsic runtime routine. @@ -471,7 +471,7 @@ void fir::runtime::genReshape(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, sourceBox, shapeBox, padBox, orderBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to ShallowCopy[Direct] runtime routine. @@ -491,7 +491,7 @@ void fir::runtime::genShallowCopy(fir::FirOpBuilder &builder, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, arrayBox, sourceFile, sourceLine); - builder.create(loc, packFunc, args); + fir::CallOp::create(builder, loc, packFunc, args); } /// Generate call to Spread intrinsic runtime routine. @@ -506,7 +506,7 @@ void fir::runtime::genSpread(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, sourceBox, dim, ncopies, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to Transpose intrinsic runtime routine. @@ -519,7 +519,7 @@ void fir::runtime::genTranspose(fir::FirOpBuilder &builder, mlir::Location loc, fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, sourceBox, sourceFile, sourceLine); - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } /// Generate call to Unpack intrinsic runtime routine. 
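Nearly every wrapper in these runtime-builder files threads the current source file and line (via `locationToFilename`/`locationToLineNo`) through as trailing call arguments so the Fortran runtime can report failures against user code. A small standalone sketch of that convention, using `std::source_location` in place of the compiler's own location machinery; the helper names are invented for illustration:

```cpp
#include <iostream>
#include <source_location>
#include <string>
#include <vector>

// Stand-in for a runtime entry point that, like the routines called above,
// takes the originating source file and line as its last arguments so errors
// can be attributed to the user's code.
void rtReportableCall(const std::string &name, const std::vector<int> &args,
                      const char *sourceFile, unsigned sourceLine) {
  std::cout << name << " called from " << sourceFile << ":" << sourceLine
            << " with " << args.size() << " argument(s)\n";
}

// Mirrors the wrapper pattern: gather the "real" arguments, then append the
// caller's location before emitting the call.
void genWrappedCall(const std::string &name, std::vector<int> args,
                    std::source_location loc = std::source_location::current()) {
  rtReportableCall(name, args, loc.file_name(), loc.line());
}

int main() {
  genWrappedCall("Cshift", {1, 2, 3});
  genWrappedCall("MatmulTranspose", {});
  return 0;
}
```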
@@ -534,5 +534,5 @@ void fir::runtime::genUnpack(fir::FirOpBuilder &builder, mlir::Location loc, auto args = fir::runtime::createArguments(builder, loc, fTy, resultBox, vectorBox, maskBox, fieldBox, sourceFile, sourceLine); - builder.create(loc, unpackFunc, args); + fir::CallOp::create(builder, loc, unpackFunc, args); } diff --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp index 9d2e9837a3df8..4c648df18b328 100644 --- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp +++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp @@ -28,7 +28,7 @@ fir::factory::Counter::Counter(mlir::Location loc, fir::FirOpBuilder &builder, one = builder.createIntegerConstant(loc, type, 1); if (canCountThroughLoops) { index = builder.createTemporary(loc, type); - builder.create(loc, initialValue, index); + fir::StoreOp::create(builder, loc, initialValue, index); } else { index = initialValue; } @@ -38,21 +38,21 @@ mlir::Value fir::factory::Counter::getAndIncrementIndex(mlir::Location loc, fir::FirOpBuilder &builder) { if (canCountThroughLoops) { - mlir::Value indexValue = builder.create(loc, index); + mlir::Value indexValue = fir::LoadOp::create(builder, loc, index); mlir::Value newValue = - builder.create(loc, indexValue, one); - builder.create(loc, newValue, index); + mlir::arith::AddIOp::create(builder, loc, indexValue, one); + fir::StoreOp::create(builder, loc, newValue, index); return indexValue; } mlir::Value indexValue = index; - index = builder.create(loc, indexValue, one); + index = mlir::arith::AddIOp::create(builder, loc, indexValue, one); return indexValue; } void fir::factory::Counter::reset(mlir::Location loc, fir::FirOpBuilder &builder) { if (canCountThroughLoops) - builder.create(loc, initialValue, index); + fir::StoreOp::create(builder, loc, initialValue, index); else index = initialValue; } @@ -103,7 +103,7 @@ void fir::factory::HomogeneousScalarStack::pushValue(mlir::Location loc, // below should not get hit but is added as a remainder/safety. if (!entity.hasIntrinsicType()) TODO(loc, "creating inlined temporary stack for derived types"); - builder.create(loc, value, tempElement); + hlfir::AssignOp::create(builder, loc, value, tempElement); } void fir::factory::HomogeneousScalarStack::resetFetchPosition( @@ -125,14 +125,14 @@ void fir::factory::HomogeneousScalarStack::destroy(mlir::Location loc, if (allocateOnHeap) { auto declare = temp.getDefiningOp(); assert(declare && "temp must have been declared"); - builder.create(loc, declare.getMemref()); + fir::FreeMemOp::create(builder, loc, declare.getMemref()); } } hlfir::Entity fir::factory::HomogeneousScalarStack::moveStackAsArrayExpr( mlir::Location loc, fir::FirOpBuilder &builder) { mlir::Value mustFree = builder.createBool(loc, allocateOnHeap); - auto hlfirExpr = builder.create(loc, temp, mustFree); + auto hlfirExpr = hlfir::AsExprOp::create(builder, loc, temp, mustFree); return hlfir::Entity{hlfirExpr}; } @@ -147,14 +147,14 @@ fir::factory::SimpleCopy::SimpleCopy(mlir::Location loc, // Use hlfir.as_expr and hlfir.associate to create a copy and leave // bufferization deals with how best to make the copy. 
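The `fir::factory::Counter` changes just above keep the index either in a temporary that is loaded, incremented, and stored back (so the count survives being re-entered inside loops) or as a plain value that is rebound. A rough standalone analogue of the two modes, with all MLIR-specific detail dropped and names invented for illustration:

```cpp
#include <cassert>
#include <memory>

// Simplified analogue of the counter above: when the count must be carried
// across loop iterations it lives in storage that is read, bumped, and
// written back; otherwise it is just a value that gets rebound.
class Counter {
public:
  Counter(long initialValue, bool canCountThroughLoops)
      : initialValue(initialValue), canCountThroughLoops(canCountThroughLoops) {
    if (canCountThroughLoops)
      slot = std::make_unique<long>(initialValue); // "temporary" storage
    else
      value = initialValue;
  }

  long getAndIncrement() {
    if (canCountThroughLoops) {
      long current = *slot; // load
      *slot = current + 1;  // store the incremented value back
      return current;
    }
    long current = value;   // pure value form: rebind instead of storing
    value = current + 1;
    return current;
  }

  void reset() {
    if (canCountThroughLoops)
      *slot = initialValue;
    else
      value = initialValue;
  }

private:
  long initialValue;
  bool canCountThroughLoops;
  std::unique_ptr<long> slot; // only used when counting through loops
  long value = 0;
};

int main() {
  Counter c(/*initialValue=*/0, /*canCountThroughLoops=*/true);
  assert(c.getAndIncrement() == 0);
  assert(c.getAndIncrement() == 1);
  c.reset();
  assert(c.getAndIncrement() == 0);
  return 0;
}
```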
if (source.isVariable()) - source = hlfir::Entity{builder.create(loc, source)}; + source = hlfir::Entity{hlfir::AsExprOp::create(builder, loc, source)}; copy = hlfir::genAssociateExpr(loc, builder, source, source.getFortranElementType(), tempName); } void fir::factory::SimpleCopy::destroy(mlir::Location loc, fir::FirOpBuilder &builder) { - builder.create(loc, copy); + hlfir::EndAssociateOp::create(builder, loc, copy); } //===----------------------------------------------------------------------===// @@ -279,7 +279,7 @@ mlir::Value fir::factory::AnyVariableStack::fetch(mlir::Location loc, mlir::Value indexValue = counter.getAndIncrementIndex(loc, builder); fir::runtime::genDescriptorAt(loc, builder, opaquePtr, indexValue, retValueBox); - hlfir::Entity retBox{builder.create(loc, retValueBox)}; + hlfir::Entity retBox{fir::LoadOp::create(builder, loc, retValueBox)}; // The runtime always tracks variable as address, but the form of the variable // that was saved may be different (raw address, fir.boxchar), ensure // the returned variable has the same form of the one that was saved. @@ -326,7 +326,7 @@ void fir::factory::AnyVectorSubscriptStack::pushShape( hlfir::getFortranElementOrSequenceType(*boxType)); mlir::Value null = builder.createNullConstant(loc, refType); mlir::Value descriptor = - builder.create(loc, *boxType, null, shape); + fir::EmboxOp::create(builder, loc, *boxType, null, shape); shapeTemp->pushValue(loc, builder, descriptor); return; } @@ -372,7 +372,7 @@ void fir::factory::AnyAddressStack::pushValue(mlir::Location loc, mlir::Value cast = variable; if (auto boxProcType = llvm::dyn_cast(variable.getType())) { cast = - builder.create(loc, boxProcType.getEleTy(), variable); + fir::BoxAddrOp::create(builder, loc, boxProcType.getEleTy(), variable); } cast = builder.createConvert(loc, builder.getIntPtrType(), cast); static_cast(this)->pushValue(loc, builder, cast); @@ -383,7 +383,7 @@ mlir::Value fir::factory::AnyAddressStack::fetch(mlir::Location loc, mlir::Value addr = static_cast(this)->fetch(loc, builder); if (auto boxProcType = llvm::dyn_cast(addressType)) { mlir::Value cast = builder.createConvert(loc, boxProcType.getEleTy(), addr); - return builder.create(loc, boxProcType, cast); + return fir::EmboxProcOp::create(builder, loc, boxProcType, cast); } return builder.createConvert(loc, addressType, addr); } diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt index 980307db315d9..d5ea3c7a8e282 100644 --- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt +++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt @@ -34,8 +34,10 @@ add_flang_library(FIRCodeGen MLIR_LIBS MLIRComplexToLLVM + MLIRComplexToROCDLLibraryCalls MLIRComplexToStandard MLIRGPUDialect + MLIRIndexToLLVM MLIRMathToFuncs MLIRMathToLLVM MLIRMathToLibm diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index ecc04a6c9a2be..609ba27bc212b 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -33,9 +33,11 @@ #include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h" +#include "mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h" #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" +#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h" 
#include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/MathToFuncs/MathToFuncs.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" @@ -1122,6 +1124,16 @@ struct AllocMemOpConversion : public fir::FIROpConversion { for (mlir::Value opnd : adaptor.getOperands()) size = rewriter.create( loc, ity, size, integerCast(loc, rewriter, ity, opnd)); + + // As the return value of malloc(0) is implementation defined, allocate one + // byte to ensure the allocation status being true. This behavior aligns to + // what the runtime has. + mlir::Value zero = genConstantIndex(loc, ity, rewriter, 0); + mlir::Value one = genConstantIndex(loc, ity, rewriter, 1); + mlir::Value cmp = rewriter.create( + loc, mlir::LLVM::ICmpPredicate::sgt, size, zero); + size = rewriter.create(loc, cmp, size, one); + auto mallocTyWidth = lowerTy().getIndexTypeBitwidth(); auto mallocTy = mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth); @@ -4145,22 +4157,24 @@ class FIRToLLVMLowering // conversions that affect the ModuleOp, e.g. create new // function operations in it. We have to run such conversions // as passes here. - mlir::OpPassManager mathConvertionPM("builtin.module"); + mlir::OpPassManager mathConversionPM("builtin.module"); bool isAMDGCN = fir::getTargetTriple(mod).isAMDGCN(); // If compiling for AMD target some math operations must be lowered to AMD // GPU library calls, the rest can be converted to LLVM intrinsics, which // is handled in the mathToLLVM conversion. The lowering to libm calls is // not needed since all math operations are handled this way. - if (isAMDGCN) - mathConvertionPM.addPass(mlir::createConvertMathToROCDL()); + if (isAMDGCN) { + mathConversionPM.addPass(mlir::createConvertMathToROCDL()); + mathConversionPM.addPass(mlir::createConvertComplexToROCDLLibraryCalls()); + } // Convert math::FPowI operations to inline implementation // only if the exponent's width is greater than 32, otherwise, // it will be lowered to LLVM intrinsic operation by a later conversion. mlir::ConvertMathToFuncsOptions mathToFuncsOptions{}; mathToFuncsOptions.minWidthOfFPowIExponent = 33; - mathConvertionPM.addPass( + mathConversionPM.addPass( mlir::createConvertMathToFuncs(mathToFuncsOptions)); mlir::ConvertComplexToStandardPassOptions complexToStandardOptions{}; @@ -4173,15 +4187,15 @@ class FIRToLLVMLowering complexToStandardOptions.complexRange = mlir::complex::ComplexRangeFlags::improved; } - mathConvertionPM.addPass( + mathConversionPM.addPass( mlir::createConvertComplexToStandardPass(complexToStandardOptions)); // Convert Math dialect operations into LLVM dialect operations. // There is no way to prefer MathToLLVM patterns over MathToLibm // patterns (applied below), so we have to run MathToLLVM conversion here. 
- mathConvertionPM.addNestedPass( + mathConversionPM.addNestedPass( mlir::createConvertMathToLLVMPass()); - if (mlir::failed(runPipeline(mathConvertionPM, mod))) + if (mlir::failed(runPipeline(mathConversionPM, mod))) return signalPassFailure(); std::optional dl = @@ -4211,6 +4225,7 @@ class FIRToLLVMLowering if (!isAMDGCN) mlir::populateMathToLibmConversionPatterns(pattern); mlir::populateComplexToLLVMConversionPatterns(typeConverter, pattern); + mlir::index::populateIndexToLLVMConversionPatterns(typeConverter, pattern); mlir::populateVectorToLLVMConversionPatterns(typeConverter, pattern); // Flang specific overloads for OpenMP operations, to allow for special diff --git a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp index e34771c67b0c3..d2cf85bedd54c 100644 --- a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp +++ b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp @@ -63,13 +63,14 @@ class PackArrayConversion : public mlir::OpRewritePattern { static constexpr llvm::StringRef bufferName = ".repacked"; // Return value of fir::BaseBoxType that represents a temporary - // array created for the original box with given extents and - // type parameters. The new box has the default lower bounds. - // If useStack is true, then the temporary will be allocated + // array created for the original box with given lbounds/extents and + // type parameters. The new box has the same shape as the original + // array. If useStack is true, then the temporary will be allocated // in stack memory (when possible). static mlir::Value allocateTempBuffer(fir::FirOpBuilder &builder, mlir::Location loc, bool useStack, mlir::Value origBox, + llvm::ArrayRef lbounds, llvm::ArrayRef extents, llvm::ArrayRef typeParams); @@ -99,7 +100,9 @@ class UnpackArrayConversion // the presence of the stack attribute does not automatically // mean that the allocation is actually done in stack memory. // For example, we always do the heap allocation for polymorphic -// types using Fortran runtime. +// types using Fortran runtime. Currently, we allocate all +// repack temporaries of derived types as polymorphic, +// so that we can preserve the dynamic type of the original. // Adding the polymorpic mold to fir.alloca and then using // Fortran runtime to compute the allocation size could probably // resolve this limitation. @@ -170,7 +173,8 @@ PackArrayConversion::matchAndRewrite(fir::PackArrayOp op, mlir::Value PackArrayConversion::allocateTempBuffer( fir::FirOpBuilder &builder, mlir::Location loc, bool useStack, - mlir::Value origBox, llvm::ArrayRef extents, + mlir::Value origBox, llvm::ArrayRef lbounds, + llvm::ArrayRef extents, llvm::ArrayRef typeParams) { auto tempType = mlir::cast( fir::extractSequenceType(origBox.getType())); @@ -191,16 +195,35 @@ mlir::Value PackArrayConversion::allocateTempBuffer( assert(!isHeapAllocation && "temp must have been allocated on the stack"); mlir::Type ptrType = base.getType(); - if (llvm::isa(ptrType)) - return base; + if (auto tempBoxType = mlir::dyn_cast(ptrType)) { + // We need to reset the CFI_attribute_allocatable before + // returning the temporary box to avoid any mishandling + // of the temporary box in Fortran runtime. + base = builder.create(loc, fir::boxMemRefType(tempBoxType), + base); + ptrType = base.getType(); + } - mlir::Type tempBoxType = fir::BoxType::get(mlir::isa(ptrType) - ? 
ptrType - : fir::unwrapRefType(ptrType)); + // Create the temporary using dynamic type of the original, + // if it is polymorphic, or it has a derived type with SEQUENCE + // or BIND attribute (such dummy arguments may have their dynamic + // type not exactly matching their static type). + // Note that for the latter case, the allocation can still be done + // without the mold, because the dynamic and static types + // must be storage compatible. + bool useDynamicType = fir::isBoxedRecordType(origBox.getType()) || + fir::isPolymorphicType(origBox.getType()); + mlir::Type tempBoxType = + fir::wrapInClassOrBoxType(fir::unwrapRefType(ptrType), + /*isPolymorphic=*/useDynamicType); + // Use the shape with proper lower bounds for the final box. + shape = builder.genShape(loc, lbounds, extents); mlir::Value newBox = builder.createBox(loc, tempBoxType, base, shape, /*slice=*/nullptr, - typeParams, /*tdesc=*/nullptr); - return newBox; + typeParams, useDynamicType ? origBox : nullptr); + // The new box might be !fir.class, while the original might be + // !fir.box - we have to add a conversion. + return builder.createConvert(loc, origBox.getType(), newBox); } mlir::FailureOr @@ -280,16 +303,11 @@ PackArrayConversion::genRepackedBox(fir::FirOpBuilder &builder, << op.getOperation() << '\n'; } - mlir::Value tempBox = - allocateTempBuffer(builder, loc, op.getStack(), box, extents, typeParams); + mlir::Value tempBox = allocateTempBuffer(builder, loc, op.getStack(), box, + lbounds, extents, typeParams); if (!op.getNoCopy()) fir::runtime::genShallowCopy(builder, loc, tempBox, box, /*resultIsAllocated=*/true); - - // Set lower bounds after the original box. - mlir::Value shift = builder.genShift(loc, lbounds); - tempBox = builder.create(loc, boxType, tempBox, shift, - /*slice=*/nullptr); builder.create(loc, tempBox); return ifOp.getResult(0); diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index 7dbf21ce0c125..b60a72e4340b9 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -1443,14 +1443,35 @@ struct TargetAMDGPU : public GenericTarget { CodeGenSpecifics::Marshalling complexArgumentType(mlir::Location loc, mlir::Type eleTy) const override { CodeGenSpecifics::Marshalling marshal; - TODO(loc, "handle complex argument types"); + const auto *sem = &floatToSemantics(kindMap, eleTy); + if (sem == &llvm::APFloat::IEEEsingle()) { + // Lower COMPLEX(KIND=4) as an array of two element values. + marshal.emplace_back(fir::SequenceType::get({2}, eleTy), AT{}); + } else if (sem == &llvm::APFloat::IEEEdouble()) { + // Pass COMPLEX(KIND=8) as two separate arguments. + marshal.emplace_back(eleTy, AT{}); + marshal.emplace_back(eleTy, AT{}); + } else { + typeTodo(sem, loc, "argument"); + } return marshal; } CodeGenSpecifics::Marshalling complexReturnType(mlir::Location loc, mlir::Type eleTy) const override { CodeGenSpecifics::Marshalling marshal; - TODO(loc, "handle complex return types"); + const auto *sem = &floatToSemantics(kindMap, eleTy); + if (sem == &llvm::APFloat::IEEEsingle()) { + // Return COMPLEX(KIND=4) as an array of two elements. + marshal.emplace_back(fir::SequenceType::get({2}, eleTy), AT{}); + } else if (sem == &llvm::APFloat::IEEEdouble()) { + // Return COMPLEX(KIND=8) via an aggregate with two fields. 
+ marshal.emplace_back(mlir::TupleType::get(eleTy.getContext(), + mlir::TypeRange{eleTy, eleTy}), + AT{}); + } else { + typeTodo(sem, loc, "return"); + } return marshal; } }; diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index b6bf2753b80ce..cf20d84cbbcdb 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -4448,7 +4448,7 @@ llvm::LogicalResult fir::UnboxProcOp::verify() { void fir::IfOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value cond, bool withElseRegion) { - build(builder, result, std::nullopt, cond, withElseRegion); + build(builder, result, {}, cond, withElseRegion); } void fir::IfOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp index 58f2b57712974..00ca6731c035b 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp @@ -296,7 +296,7 @@ struct SetLengthOpConversion llvm::StringRef tmpName{".tmp"}; llvm::SmallVector lenParams{adaptor.getLength()}; auto alloca = builder.createTemporary(loc, charType, tmpName, - /*shape=*/std::nullopt, lenParams); + /*shape=*/{}, lenParams); auto declareOp = builder.create( loc, alloca, tmpName, /*shape=*/mlir::Value{}, lenParams, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp index 03cc92e975b19..c5cf01ed98357 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp @@ -405,7 +405,7 @@ void OrderedAssignmentRewriter::pre(hlfir::ForallMaskOp forallMaskOp) { mlir::Location loc = forallMaskOp.getLoc(); mlir::Value mask = generateYieldedScalarValue(forallMaskOp.getMaskRegion(), builder.getI1Type()); - auto ifOp = builder.create(loc, std::nullopt, mask, false); + auto ifOp = builder.create(loc, mlir::TypeRange{}, mask, false); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); constructStack.push_back(ifOp); } @@ -530,7 +530,7 @@ void OrderedAssignmentRewriter::generateMaskIfOp(mlir::Value cdt) { mlir::Location loc = cdt.getLoc(); cdt = hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{cdt}); cdt = builder.createConvert(loc, builder.getI1Type(), cdt); - auto ifOp = builder.create(cdt.getLoc(), std::nullopt, cdt, + auto ifOp = builder.create(cdt.getLoc(), mlir::TypeRange{}, cdt, /*withElseRegion=*/false); constructStack.push_back(ifOp.getOperation()); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); diff --git a/flang/lib/Optimizer/Transforms/AbstractResult.cpp b/flang/lib/Optimizer/Transforms/AbstractResult.cpp index 59e2eeb76c715..391cfe3ceb9a2 100644 --- a/flang/lib/Optimizer/Transforms/AbstractResult.cpp +++ b/flang/lib/Optimizer/Transforms/AbstractResult.cpp @@ -137,9 +137,9 @@ class CallConversion : public mlir::OpRewritePattern { auto buffer = saveResult.getMemref(); mlir::Value arg = buffer; if (mustEmboxResult(result.getType(), shouldBoxResult)) - arg = rewriter.create( - loc, argType, buffer, saveResult.getShape(), /*slice*/ mlir::Value{}, - saveResult.getTypeparams()); + arg = fir::EmboxOp::create(rewriter, loc, argType, buffer, + saveResult.getShape(), /*slice*/ mlir::Value{}, + saveResult.getTypeparams()); 
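For reference, the TargetAMDGPU marshalling added above maps Fortran COMPLEX values onto the AMDGPU calling convention exactly as its comments state: COMPLEX(KIND=4) travels as a two-element array of the element type, while COMPLEX(KIND=8) is passed as two separate scalars and returned as a two-field aggregate. The C++-style prototypes below only illustrate those shapes (hypothetical names, not generated code):

  void passComplex4(float z[2]);           // COMPLEX(KIND=4) argument: array of two elements
  void passComplex8(double re, double im); // COMPLEX(KIND=8) argument: two separate values
  // COMPLEX(KIND=4) result: returned as float[2]
  // COMPLEX(KIND=8) result: returned as an aggregate { double, double }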
llvm::SmallVector newResultTypes; bool isResultBuiltinCPtr = fir::isa_builtin_cptr_type(result.getType()); @@ -155,8 +155,8 @@ class CallConversion : public mlir::OpRewritePattern { if (!isResultBuiltinCPtr) newOperands.emplace_back(arg); newOperands.append(op.getOperands().begin(), op.getOperands().end()); - newOp = rewriter.create(loc, *op.getCallee(), - newResultTypes, newOperands); + newOp = fir::CallOp::create(rewriter, loc, *op.getCallee(), + newResultTypes, newOperands); } else { // Indirect calls. llvm::SmallVector newInputTypes; @@ -169,13 +169,13 @@ class CallConversion : public mlir::OpRewritePattern { llvm::SmallVector newOperands; newOperands.push_back( - rewriter.create(loc, newFuncTy, op.getOperand(0))); + fir::ConvertOp::create(rewriter, loc, newFuncTy, op.getOperand(0))); if (!isResultBuiltinCPtr) newOperands.push_back(arg); newOperands.append(op.getOperands().begin() + 1, op.getOperands().end()); - newOp = rewriter.create(loc, mlir::SymbolRefAttr{}, - newResultTypes, newOperands); + newOp = fir::CallOp::create(rewriter, loc, mlir::SymbolRefAttr{}, + newResultTypes, newOperands); } } @@ -191,8 +191,8 @@ class CallConversion : public mlir::OpRewritePattern { passArgPos = rewriter.getI32IntegerAttr(*op.getPassArgPos() + passArgShift); // TODO: propagate argument and result attributes (need to be shifted). - newOp = rewriter.create( - loc, newResultTypes, rewriter.getStringAttr(op.getMethod()), + newOp = fir::DispatchOp::create( + rewriter, loc, newResultTypes, rewriter.getStringAttr(op.getMethod()), op.getOperands()[0], newOperands, passArgPos, /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, op.getProcedureAttrsAttr()); @@ -280,7 +280,7 @@ processReturnLikeOp(OpTy ret, mlir::Value newArg, // register pass, this is possible for fir.box results, or fir.record // with no length parameters. Simply store the result in the result // storage. at the return point. - rewriter.create(loc, resultValue, newArg); + fir::StoreOp::create(rewriter, loc, resultValue, newArg); rewriter.replaceOpWithNewOp(ret); } // Delete result old local storage if unused. @@ -337,8 +337,8 @@ class AddrOfOpConversion : public mlir::OpRewritePattern { newFuncTy = getCPtrFunctionType(oldFuncTy); else newFuncTy = getNewFunctionType(oldFuncTy, shouldBoxResult); - auto newAddrOf = rewriter.create(addrOf.getLoc(), newFuncTy, - addrOf.getSymbol()); + auto newAddrOf = fir::AddrOfOp::create(rewriter, addrOf.getLoc(), newFuncTy, + addrOf.getSymbol()); // Rather than converting all op a function pointer might transit through // (e.g calls, stores, loads, converts...), cast new type to the abstract // type. A conversion will be added when calling indirect calls of abstract @@ -397,7 +397,7 @@ class AbstractResultOpt if (mustEmboxResult(resultType, shouldBoxResult)) { auto bufferType = fir::ReferenceType::get(resultType); rewriter.setInsertionPointToStart(&func.front()); - newArg = rewriter.create(loc, bufferType, newArg); + newArg = fir::BoxAddrOp::create(rewriter, loc, bufferType, newArg); } patterns.insert(context, newArg); target.addDynamicallyLegalOp( diff --git a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp index b27c1b26dedb3..85403ad257657 100644 --- a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp +++ b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp @@ -48,12 +48,21 @@ static llvm::cl::opt llvm::cl::Hidden, llvm::cl::desc("Add TBAA tags to local allocations.")); +// Engineering option to triage TBAA tags attachment for accesses +// of allocatable entities. 
+static llvm::cl::opt localAllocsThreshold( + "local-alloc-tbaa-threshold", llvm::cl::init(0), llvm::cl::ReallyHidden, + llvm::cl::desc("If present, stops generating TBAA tags for accesses of " + "local allocations after N accesses in a module")); + namespace { /// Shared state per-module class PassState { public: - PassState(mlir::DominanceInfo &domInfo) : domInfo(domInfo) {} + PassState(mlir::DominanceInfo &domInfo, + std::optional localAllocsThreshold) + : domInfo(domInfo), localAllocsThreshold(localAllocsThreshold) {} /// memoised call to fir::AliasAnalysis::getSource inline const fir::AliasAnalysis::Source &getSource(mlir::Value value) { if (!analysisCache.contains(value)) @@ -84,6 +93,11 @@ class PassState { // (e.g. !fir.ref>}>>). bool typeReferencesDescriptor(mlir::Type type); + // Returns true if we can attach a TBAA tag to an access of an allocatable + // entities. It checks if localAllocsThreshold allows the next tag + // attachment. + bool attachLocalAllocTag(); + private: mlir::DominanceInfo &domInfo; fir::AliasAnalysis analysis; @@ -103,6 +117,8 @@ class PassState { // Local pass cache for derived types that contain descriptor // member(s), to avoid the cost of isRecordWithDescriptorMember(). llvm::DenseSet typesContainingDescriptors; + + std::optional localAllocsThreshold; }; // Process fir.dummy_scope operations in the given func: @@ -169,6 +185,19 @@ bool PassState::typeReferencesDescriptor(mlir::Type type) { return false; } +bool PassState::attachLocalAllocTag() { + if (!localAllocsThreshold) + return true; + if (*localAllocsThreshold == 0) { + LLVM_DEBUG(llvm::dbgs().indent(2) + << "WARN: not assigning TBAA tag for an allocated entity access " + "due to the threshold\n"); + return false; + } + --*localAllocsThreshold; + return true; +} + class AddAliasTagsPass : public fir::impl::AddAliasTagsBase { public: void runOnOperation() override; @@ -335,16 +364,16 @@ void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op, LLVM_DEBUG(llvm::dbgs().indent(2) << "WARN: unknown defining op for SourceKind::Allocate " << *op << "\n"); - } else if (source.isPointer()) { + } else if (source.isPointer() && state.attachLocalAllocTag()) { LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to allocation at " << *op << "\n"); tag = state.getFuncTreeWithScope(func, scopeOp).targetDataTree.getTag(); - } else if (name) { + } else if (name && state.attachLocalAllocTag()) { LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to allocation " << name << " at " << *op << "\n"); tag = state.getFuncTreeWithScope(func, scopeOp) .allocatedDataTree.getTag(*name); - } else { + } else if (state.attachLocalAllocTag()) { LLVM_DEBUG(llvm::dbgs().indent(2) << "WARN: couldn't find a name for allocation " << *op << "\n"); @@ -372,7 +401,9 @@ void AddAliasTagsPass::runOnOperation() { // thinks the pass operates on), then the real work of the pass is done in // runOnAliasInterface auto &domInfo = getAnalysis(); - PassState state(domInfo); + PassState state(domInfo, localAllocsThreshold.getPosition() + ? 
std::optional(localAllocsThreshold) + : std::nullopt); mlir::ModuleOp mod = getOperation(); mod.walk( diff --git a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp index d45f855c9078e..f1c66a5bbcf8c 100644 --- a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp +++ b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp @@ -60,9 +60,10 @@ class AffineLoadConversion if (!maybeExpandedMap) return failure(); - auto coorOp = rewriter.create( - op.getLoc(), fir::ReferenceType::get(op.getResult().getType()), - adaptor.getMemref(), *maybeExpandedMap); + auto coorOp = fir::CoordinateOp::create( + rewriter, op.getLoc(), + fir::ReferenceType::get(op.getResult().getType()), adaptor.getMemref(), + *maybeExpandedMap); rewriter.replaceOpWithNewOp(op, coorOp.getResult()); return success(); @@ -83,8 +84,9 @@ class AffineStoreConversion if (!maybeExpandedMap) return failure(); - auto coorOp = rewriter.create( - op.getLoc(), fir::ReferenceType::get(op.getValueToStore().getType()), + auto coorOp = fir::CoordinateOp::create( + rewriter, op.getLoc(), + fir::ReferenceType::get(op.getValueToStore().getType()), adaptor.getMemref(), *maybeExpandedMap); rewriter.replaceOpWithNewOp(op, adaptor.getValue(), coorOp.getResult()); diff --git a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp index ef82e400bea14..b032767eef6f0 100644 --- a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp +++ b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp @@ -366,8 +366,9 @@ static mlir::Type coordinateArrayElement(fir::ArrayCoorOp op) { static void populateIndexArgs(fir::ArrayCoorOp acoOp, fir::ShapeOp shape, SmallVectorImpl &indexArgs, mlir::PatternRewriter &rewriter) { - auto one = rewriter.create( - acoOp.getLoc(), rewriter.getIndexType(), rewriter.getIndexAttr(1)); + auto one = mlir::arith::ConstantOp::create(rewriter, acoOp.getLoc(), + rewriter.getIndexType(), + rewriter.getIndexAttr(1)); auto extents = shape.getExtents(); for (auto i = extents.begin(); i < extents.end(); i++) { indexArgs.push_back(one); @@ -379,8 +380,9 @@ static void populateIndexArgs(fir::ArrayCoorOp acoOp, fir::ShapeOp shape, static void populateIndexArgs(fir::ArrayCoorOp acoOp, fir::ShapeShiftOp shape, SmallVectorImpl &indexArgs, mlir::PatternRewriter &rewriter) { - auto one = rewriter.create( - acoOp.getLoc(), rewriter.getIndexType(), rewriter.getIndexAttr(1)); + auto one = mlir::arith::ConstantOp::create(rewriter, acoOp.getLoc(), + rewriter.getIndexType(), + rewriter.getIndexAttr(1)); auto extents = shape.getPairs(); for (auto i = extents.begin(); i < extents.end();) { indexArgs.push_back(*i++); @@ -422,13 +424,13 @@ createAffineOps(mlir::Value arrayRef, mlir::PatternRewriter &rewriter) { populateIndexArgs(acoOp, indexArgs, rewriter); - auto affineApply = rewriter.create( - acoOp.getLoc(), affineMap, indexArgs); + auto affineApply = affine::AffineApplyOp::create(rewriter, acoOp.getLoc(), + affineMap, indexArgs); auto arrayElementType = coordinateArrayElement(acoOp); auto newType = mlir::MemRefType::get({mlir::ShapedType::kDynamic}, arrayElementType); - auto arrayConvert = rewriter.create(acoOp.getLoc(), newType, - acoOp.getMemref()); + auto arrayConvert = fir::ConvertOp::create(rewriter, acoOp.getLoc(), newType, + acoOp.getMemref()); return std::make_pair(affineApply, arrayConvert); } @@ -495,7 +497,7 @@ class AffineLoopConversion : public mlir::OpRewritePattern { affineFor.getRegionIterArgs()); if (!results.empty()) { 
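The local-alloc-tbaa-threshold option introduced above is a triage knob: when it is set, the pass stops attaching TBAA tags to accesses of local allocations after the given number of tags has been emitted in a module, which makes it possible to bisect a miscompile down to a single tag. The countdown in attachLocalAllocTag boils down to the following standalone sketch (a hypothetical helper shown only to illustrate the logic):

  #include <optional>

  // Grant a tag while the optional budget allows it; each grant consumes one
  // unit. An empty optional means no threshold was given, so always tag.
  static bool allowNextTag(std::optional<unsigned> &budget) {
    if (!budget)
      return true;
    if (*budget == 0)
      return false;
    --*budget;
    return true;
  }

With a budget of 2, for example, the first two queries return true and every later one returns false.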
rewriter.setInsertionPointToEnd(affineFor.getBody()); - rewriter.create(resultOp->getLoc(), results); + affine::AffineYieldOp::create(rewriter, resultOp->getLoc(), results); } rewriter.finalizeOpModification(affineFor.getOperation()); @@ -525,8 +527,8 @@ class AffineLoopConversion : public mlir::OpRewritePattern { std::pair positiveConstantStep(fir::DoLoopOp op, int64_t step, mlir::PatternRewriter &rewriter) const { - auto affineFor = rewriter.create( - op.getLoc(), ValueRange(op.getLowerBound()), + auto affineFor = affine::AffineForOp::create( + rewriter, op.getLoc(), ValueRange(op.getLowerBound()), mlir::AffineMap::get(0, 1, mlir::getAffineSymbolExpr(0, op.getContext())), ValueRange(op.getUpperBound()), @@ -543,24 +545,24 @@ class AffineLoopConversion : public mlir::OpRewritePattern { auto step = mlir::getAffineSymbolExpr(2, op.getContext()); mlir::AffineMap upperBoundMap = mlir::AffineMap::get( 0, 3, (upperBound - lowerBound + step).floorDiv(step)); - auto genericUpperBound = rewriter.create( - op.getLoc(), upperBoundMap, + auto genericUpperBound = affine::AffineApplyOp::create( + rewriter, op.getLoc(), upperBoundMap, ValueRange({op.getLowerBound(), op.getUpperBound(), op.getStep()})); auto actualIndexMap = mlir::AffineMap::get( 1, 2, (lowerBound + mlir::getAffineDimExpr(0, op.getContext())) * mlir::getAffineSymbolExpr(1, op.getContext())); - auto affineFor = rewriter.create( - op.getLoc(), ValueRange(), + auto affineFor = affine::AffineForOp::create( + rewriter, op.getLoc(), ValueRange(), AffineMap::getConstantMap(0, op.getContext()), genericUpperBound.getResult(), mlir::AffineMap::get(0, 1, 1 + mlir::getAffineSymbolExpr(0, op.getContext())), 1, op.getIterOperands()); rewriter.setInsertionPointToStart(affineFor.getBody()); - auto actualIndex = rewriter.create( - op.getLoc(), actualIndexMap, + auto actualIndex = affine::AffineApplyOp::create( + rewriter, op.getLoc(), actualIndexMap, ValueRange( {affineFor.getInductionVar(), op.getLowerBound(), op.getStep()})); return std::make_pair(affineFor, actualIndex.getResult()); @@ -588,8 +590,8 @@ class AffineIfConversion : public mlir::OpRewritePattern { << "AffineIfConversion: couldn't calculate affine condition\n";); return failure(); } - auto affineIf = rewriter.create( - op.getLoc(), affineCondition.getIntegerSet(), + auto affineIf = affine::AffineIfOp::create( + rewriter, op.getLoc(), affineCondition.getIntegerSet(), affineCondition.getAffineArgs(), !op.getElseRegion().empty()); rewriter.startOpModification(affineIf); affineIf.getThenBlock()->getOperations().splice( diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp index 8544d17f62248..247ba953f3265 100644 --- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp +++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp @@ -856,7 +856,7 @@ static bool getAdjustedExtents(mlir::Location loc, auto idxTy = rewriter.getIndexType(); if (isAssumedSize(result)) { // Use slice information to compute the extent of the column. 
- auto one = rewriter.create(loc, 1); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); mlir::Value size = one; if (mlir::Value sliceArg = arrLoad.getSlice()) { if (auto sliceOp = @@ -896,14 +896,14 @@ static mlir::Value getOrReadExtentsAndShapeOp( mlir::cast(dyn_cast_ptrOrBoxEleTy(boxTy)).getDimension(); auto idxTy = rewriter.getIndexType(); for (decltype(rank) dim = 0; dim < rank; ++dim) { - auto dimVal = rewriter.create(loc, dim); - auto dimInfo = rewriter.create(loc, idxTy, idxTy, idxTy, - arrLoad.getMemref(), dimVal); + auto dimVal = mlir::arith::ConstantIndexOp::create(rewriter, loc, dim); + auto dimInfo = BoxDimsOp::create(rewriter, loc, idxTy, idxTy, idxTy, + arrLoad.getMemref(), dimVal); result.emplace_back(dimInfo.getResult(1)); } if (!arrLoad.getShape()) { auto shapeType = ShapeType::get(rewriter.getContext(), rank); - return rewriter.create(loc, shapeType, result); + return ShapeOp::create(rewriter, loc, shapeType, result); } auto shiftOp = arrLoad.getShape().getDefiningOp(); auto shapeShiftType = ShapeShiftType::get(rewriter.getContext(), rank); @@ -912,8 +912,8 @@ static mlir::Value getOrReadExtentsAndShapeOp( shapeShiftOperands.push_back(lb); shapeShiftOperands.push_back(extent); } - return rewriter.create(loc, shapeShiftType, - shapeShiftOperands); + return ShapeShiftOp::create(rewriter, loc, shapeShiftType, + shapeShiftOperands); } copyUsingSlice = getAdjustedExtents(loc, rewriter, arrLoad, result, arrLoad.getShape()); @@ -952,13 +952,13 @@ static mlir::Value genCoorOp(mlir::PatternRewriter &rewriter, auto module = load->getParentOfType(); FirOpBuilder builder(rewriter, module); auto typeparams = getTypeParamsIfRawData(loc, builder, load, alloc.getType()); - mlir::Value result = rewriter.create( - loc, eleTy, alloc, shape, slice, + mlir::Value result = ArrayCoorOp::create( + rewriter, loc, eleTy, alloc, shape, slice, llvm::ArrayRef{originated}.take_front(dimension), typeparams); if (dimension < originated.size()) - result = rewriter.create( - loc, resTy, result, + result = fir::CoordinateOp::create( + rewriter, loc, resTy, result, llvm::ArrayRef{originated}.drop_front(dimension)); return result; } @@ -971,13 +971,13 @@ static mlir::Value getCharacterLen(mlir::Location loc, FirOpBuilder &builder, // The loaded array is an emboxed value. Get the CHARACTER length from // the box value. auto eleSzInBytes = - builder.create(loc, charLenTy, load.getMemref()); + BoxEleSizeOp::create(builder, loc, charLenTy, load.getMemref()); auto kindSize = builder.getKindMap().getCharacterBitsize(charTy.getFKind()); auto kindByteSize = builder.createIntegerConstant(loc, charLenTy, kindSize / 8); - return builder.create(loc, eleSzInBytes, - kindByteSize); + return mlir::arith::DivSIOp::create(builder, loc, eleSzInBytes, + kindByteSize); } // The loaded array is a (set of) unboxed values. If the CHARACTER's // length is not a constant, it must be provided as a type parameter to @@ -1003,11 +1003,11 @@ void genArrayCopy(mlir::Location loc, mlir::PatternRewriter &rewriter, auto idxTy = rewriter.getIndexType(); // Build loop nest from column to row. 
for (auto sh : llvm::reverse(extents)) { - auto ubi = rewriter.create(loc, idxTy, sh); - auto zero = rewriter.create(loc, 0); - auto one = rewriter.create(loc, 1); - auto ub = rewriter.create(loc, idxTy, ubi, one); - auto loop = rewriter.create(loc, zero, ub, one); + auto ubi = ConvertOp::create(rewriter, loc, idxTy, sh); + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); + auto ub = mlir::arith::SubIOp::create(rewriter, loc, idxTy, ubi, one); + auto loop = DoLoopOp::create(rewriter, loc, zero, ub, one); rewriter.setInsertionPointToStart(loop.getBody()); indices.push_back(loop.getInductionVar()); } @@ -1015,13 +1015,13 @@ void genArrayCopy(mlir::Location loc, mlir::PatternRewriter &rewriter, std::reverse(indices.begin(), indices.end()); auto module = arrLoad->getParentOfType(); FirOpBuilder builder(rewriter, module); - auto fromAddr = rewriter.create( - loc, getEleTy(src.getType()), src, shapeOp, + auto fromAddr = ArrayCoorOp::create( + rewriter, loc, getEleTy(src.getType()), src, shapeOp, CopyIn && copyUsingSlice ? sliceOp : mlir::Value{}, factory::originateIndices(loc, rewriter, src.getType(), shapeOp, indices), getTypeParamsIfRawData(loc, builder, arrLoad, src.getType())); - auto toAddr = rewriter.create( - loc, getEleTy(dst.getType()), dst, shapeOp, + auto toAddr = ArrayCoorOp::create( + rewriter, loc, getEleTy(dst.getType()), dst, shapeOp, !CopyIn && copyUsingSlice ? sliceOp : mlir::Value{}, factory::originateIndices(loc, rewriter, dst.getType(), shapeOp, indices), getTypeParamsIfRawData(loc, builder, arrLoad, dst.getType())); @@ -1093,15 +1093,16 @@ allocateArrayTemp(mlir::Location loc, mlir::PatternRewriter &rewriter, findNonconstantExtents(baseType, extents); llvm::SmallVector typeParams = genArrayLoadTypeParameters(loc, rewriter, load); - mlir::Value allocmem = rewriter.create( - loc, dyn_cast_ptrOrBoxEleTy(baseType), typeParams, nonconstantExtents); + mlir::Value allocmem = + AllocMemOp::create(rewriter, loc, dyn_cast_ptrOrBoxEleTy(baseType), + typeParams, nonconstantExtents); mlir::Type eleType = fir::unwrapSequenceType(fir::unwrapPassByRefType(baseType)); if (fir::isRecordWithAllocatableMember(eleType)) { // The allocatable component descriptors need to be set to a clean // deallocated status before anything is done with them. 
- mlir::Value box = rewriter.create( - loc, fir::BoxType::get(allocmem.getType()), allocmem, shape, + mlir::Value box = fir::EmboxOp::create( + rewriter, loc, fir::BoxType::get(allocmem.getType()), allocmem, shape, /*slice=*/mlir::Value{}, typeParams); auto module = load->getParentOfType(); FirOpBuilder builder(rewriter, module); @@ -1111,12 +1112,12 @@ allocateArrayTemp(mlir::Location loc, mlir::PatternRewriter &rewriter, auto cleanup = [=](mlir::PatternRewriter &r) { FirOpBuilder builder(r, module); runtime::genDerivedTypeDestroy(builder, loc, box); - r.create(loc, allocmem); + FreeMemOp::create(r, loc, allocmem); }; return {allocmem, cleanup}; } auto cleanup = [=](mlir::PatternRewriter &r) { - r.create(loc, allocmem); + FreeMemOp::create(r, loc, allocmem); }; return {allocmem, cleanup}; } @@ -1257,7 +1258,7 @@ class ArrayUpdateConversion : public ArrayUpdateConversionBase { if (auto inEleTy = dyn_cast_ptrEleTy(input.getType())) { emitFatalError(loc, "array_update on references not supported"); } else { - rewriter.create(loc, input, coor); + fir::StoreOp::create(rewriter, loc, input, coor); } }; auto lhsEltRefType = toRefType(update.getMerge().getType()); @@ -1368,7 +1369,7 @@ class ArrayAmendConversion : public mlir::OpRewritePattern { auto *op = amend.getOperation(); rewriter.setInsertionPoint(op); auto loc = amend.getLoc(); - auto undef = rewriter.create(loc, amend.getType()); + auto undef = UndefOp::create(rewriter, loc, amend.getType()); rewriter.replaceOp(amend, undef.getResult()); return mlir::success(); } diff --git a/flang/lib/Optimizer/Transforms/AssumedRankOpConversion.cpp b/flang/lib/Optimizer/Transforms/AssumedRankOpConversion.cpp index 6af1cb988a4c1..4c7b228eefeb5 100644 --- a/flang/lib/Optimizer/Transforms/AssumedRankOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/AssumedRankOpConversion.cpp @@ -88,8 +88,8 @@ class ReboxAssumedRankConv (fir::isPolymorphicType(oldBoxType) || (newEleType != oldBoxType.unwrapInnerType())) && !fir::isPolymorphicType(newBoxType)) { - newDtype = builder.create( - loc, mlir::TypeAttr::get(newDerivedType)); + newDtype = fir::TypeDescOp::create(builder, loc, + mlir::TypeAttr::get(newDerivedType)); } else { newDtype = builder.createNullConstant(loc); } @@ -103,7 +103,7 @@ class ReboxAssumedRankConv rebox.getBox(), newDtype, newAttribute, lowerBoundModifier); - mlir::Value descValue = builder.create(loc, tempDesc); + mlir::Value descValue = fir::LoadOp::create(builder, loc, tempDesc); mlir::Value castDesc = builder.createConvert(loc, newBoxType, descValue); rewriter.replaceOp(rebox, castDesc); return mlir::success(); diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp index 2dd6950b34897..baa8e591ee162 100644 --- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp +++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp @@ -68,25 +68,26 @@ struct CUFAddConstructor // Symbol reference to CUFRegisterAllocator. builder.setInsertionPointToEnd(mod.getBody()); - auto registerFuncOp = builder.create( - loc, RTNAME_STRING(CUFRegisterAllocator), funcTy); + auto registerFuncOp = mlir::LLVM::LLVMFuncOp::create( + builder, loc, RTNAME_STRING(CUFRegisterAllocator), funcTy); registerFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private); auto cufRegisterAllocatorRef = mlir::SymbolRefAttr::get( mod.getContext(), RTNAME_STRING(CUFRegisterAllocator)); builder.setInsertionPointToEnd(mod.getBody()); // Create the constructor function that call CUFRegisterAllocator. 
- auto func = builder.create(loc, cudaFortranCtorName, - funcTy); + auto func = mlir::LLVM::LLVMFuncOp::create(builder, loc, + cudaFortranCtorName, funcTy); func.setLinkage(mlir::LLVM::Linkage::Internal); builder.setInsertionPointToStart(func.addEntryBlock(builder)); - builder.create(loc, funcTy, cufRegisterAllocatorRef); + mlir::LLVM::CallOp::create(builder, loc, funcTy, cufRegisterAllocatorRef); auto gpuMod = symTab.lookup(cudaDeviceModuleName); if (gpuMod) { auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(ctx); - auto registeredMod = builder.create( - loc, llvmPtrTy, mlir::SymbolRefAttr::get(ctx, gpuMod.getName())); + auto registeredMod = cuf::RegisterModuleOp::create( + builder, loc, llvmPtrTy, + mlir::SymbolRefAttr::get(ctx, gpuMod.getName())); fir::LLVMTypeConverter typeConverter(mod, /*applyTBAA=*/false, /*forceUnifiedTBAATree=*/false, *dl); @@ -96,7 +97,8 @@ struct CUFAddConstructor auto kernelName = mlir::SymbolRefAttr::get( builder.getStringAttr(cudaDeviceModuleName), {mlir::SymbolRefAttr::get(builder.getContext(), func.getName())}); - builder.create(loc, kernelName, registeredMod); + cuf::RegisterKernelOp::create(builder, loc, kernelName, + registeredMod); } } @@ -140,19 +142,19 @@ struct CUFAddConstructor auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size); // Global variable address - mlir::Value addr = builder.create( - loc, globalOp.resultType(), globalOp.getSymbol()); + mlir::Value addr = fir::AddrOfOp::create( + builder, loc, globalOp.resultType(), globalOp.getSymbol()); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, registeredMod, addr, gblName, sizeVal)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); } break; default: break; } } } - builder.create(loc, mlir::ValueRange{}); + mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{}); // Create the llvm.global_ctor with the function. 
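Taken together, the CUFAddConstructor changes above still emit the same registration machinery, only through the Op::create(builder, ...) form: an internal constructor function is built that calls the CUF allocator-registration runtime entry, registers the device module, each of its kernels, and the registered device globals, and that function is then listed in llvm.global_ctors so it runs at program startup. A purely conceptual C++ analogue, with placeholder names rather than the generated symbols or the actual runtime API:

  extern "C" void cufRegisterAllocator(); // stands in for the CUFRegisterAllocator runtime entry
  static void cudaFortranConstructor() {
    cufRegisterAllocator();               // hook the CUF allocators into the runtime
    // ... register the GPU module, its kernels, and device globals ...
  }
  // The pass appends cudaFortranConstructor to llvm.global_ctors (priority 0).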
// TODO: We might want to have a utility that retrieve it if already @@ -165,8 +167,8 @@ struct CUFAddConstructor llvm::SmallVector data; priorities.push_back(0); data.push_back(mlir::LLVM::ZeroAttr::get(mod.getContext())); - builder.create( - mod.getLoc(), builder.getArrayAttr(funcs), + mlir::LLVM::GlobalCtorsOp::create( + builder, mod.getLoc(), builder.getArrayAttr(funcs), builder.getI32ArrayAttr(priorities), builder.getArrayAttr(data)); } }; diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp index f6381ef8a8a21..5e910f7da6472 100644 --- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp +++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp @@ -93,10 +93,11 @@ struct CUFComputeSharedMemoryOffsetsAndSize mlir::Value dynSize = builder.createIntegerConstant(loc, idxTy, tySize); for (auto extent : sharedOp.getShape()) - dynSize = builder.create(loc, dynSize, extent); + dynSize = + mlir::arith::MulIOp::create(builder, loc, dynSize, extent); if (crtDynOffset) - crtDynOffset = - builder.create(loc, crtDynOffset, dynSize); + crtDynOffset = mlir::arith::AddIOp::create(builder, loc, + crtDynOffset, dynSize); else crtDynOffset = dynSize; @@ -142,9 +143,9 @@ struct CUFComputeSharedMemoryOffsetsAndSize fir::GlobalOp::getDataAttrAttrName(globalOpName), cuf::DataAttributeAttr::get(gpuMod.getContext(), cuf::DataAttribute::Shared))); - auto sharedMem = builder.create( - funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType, - init, linkage, attrs); + auto sharedMem = fir::GlobalOp::create( + builder, funcOp.getLoc(), sharedMemGlobalName, false, false, + sharedMemType, init, linkage, attrs); sharedMem.setAlignment(alignment); } } diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index bfb0daeacb8c3..35badb6eadb1c 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -53,21 +53,26 @@ static void processAddrOfOp(fir::AddrOfOp addrOfOp, } } +static void processTypeDescriptor(fir::RecordType recTy, + mlir::SymbolTable &symbolTable, + llvm::DenseSet &candidates) { + if (auto globalOp = symbolTable.lookup( + fir::NameUniquer::getTypeDescriptorName(recTy.getName()))) { + if (!candidates.contains(globalOp)) { + globalOp.walk([&](fir::AddrOfOp op) { + processAddrOfOp(op, symbolTable, candidates, + /*recurseInGlobal=*/true); + }); + candidates.insert(globalOp); + } + } +} + static void processEmboxOp(fir::EmboxOp emboxOp, mlir::SymbolTable &symbolTable, llvm::DenseSet &candidates) { if (auto recTy = mlir::dyn_cast( - fir::unwrapRefType(emboxOp.getMemref().getType()))) { - if (auto globalOp = symbolTable.lookup( - fir::NameUniquer::getTypeDescriptorName(recTy.getName()))) { - if (!candidates.contains(globalOp)) { - globalOp.walk([&](fir::AddrOfOp op) { - processAddrOfOp(op, symbolTable, candidates, - /*recurseInGlobal=*/true); - }); - candidates.insert(globalOp); - } - } - } + fir::unwrapRefType(emboxOp.getMemref().getType()))) + processTypeDescriptor(recTy, symbolTable, candidates); } static void @@ -85,6 +90,17 @@ prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp, } } +static void +processPotentialTypeDescriptor(mlir::Type candidateType, + mlir::SymbolTable &symbolTable, + llvm::DenseSet &candidates) { + if (auto boxTy = mlir::dyn_cast(candidateType)) + candidateType = boxTy.getEleTy(); + candidateType = 
fir::unwrapSequenceType(fir::unwrapRefType(candidateType)); + if (auto recTy = mlir::dyn_cast(candidateType)) + processTypeDescriptor(recTy, symbolTable, candidates); +} + class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase { public: void runOnOperation() override { @@ -115,6 +131,8 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase { for (auto globalOp : mod.getOps()) { if (cuf::isRegisteredDeviceGlobal(globalOp)) { candidates.insert(globalOp); + processPotentialTypeDescriptor(globalOp.getType(), parentSymTable, + candidates); } else if (globalOp.getConstant() && mlir::isa( fir::unwrapRefType(globalOp.resultType()))) { diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp index fe69ffa8350af..a40ed95391c3a 100644 --- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp @@ -46,27 +46,28 @@ static mlir::Value createKernelArgArray(mlir::Location loc, auto structTy = mlir::LLVM::LLVMStructType::getLiteral(ctx, structTypes); auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext()); mlir::Type i32Ty = rewriter.getI32Type(); - auto zero = rewriter.create( - loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0)); - auto one = rewriter.create( - loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 1)); + auto zero = mlir::LLVM::ConstantOp::create(rewriter, loc, i32Ty, + rewriter.getIntegerAttr(i32Ty, 0)); + auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, i32Ty, + rewriter.getIntegerAttr(i32Ty, 1)); mlir::Value argStruct = - rewriter.create(loc, ptrTy, structTy, one); - auto size = rewriter.create( - loc, i32Ty, rewriter.getIntegerAttr(i32Ty, structTypes.size())); + mlir::LLVM::AllocaOp::create(rewriter, loc, ptrTy, structTy, one); + auto size = mlir::LLVM::ConstantOp::create( + rewriter, loc, i32Ty, rewriter.getIntegerAttr(i32Ty, structTypes.size())); mlir::Value argArray = - rewriter.create(loc, ptrTy, ptrTy, size); + mlir::LLVM::AllocaOp::create(rewriter, loc, ptrTy, ptrTy, size); for (auto [i, arg] : llvm::enumerate(operands)) { - auto indice = rewriter.create( - loc, i32Ty, rewriter.getIntegerAttr(i32Ty, i)); - mlir::Value structMember = rewriter.create( - loc, ptrTy, structTy, argStruct, - mlir::ArrayRef({zero, indice})); - rewriter.create(loc, arg, structMember); - mlir::Value arrayMember = rewriter.create( - loc, ptrTy, ptrTy, argArray, mlir::ArrayRef({indice})); - rewriter.create(loc, structMember, arrayMember); + auto indice = mlir::LLVM::ConstantOp::create( + rewriter, loc, i32Ty, rewriter.getIntegerAttr(i32Ty, i)); + mlir::Value structMember = + LLVM::GEPOp::create(rewriter, loc, ptrTy, structTy, argStruct, + mlir::ArrayRef({zero, indice})); + LLVM::StoreOp::create(rewriter, loc, arg, structMember); + mlir::Value arrayMember = + LLVM::GEPOp::create(rewriter, loc, ptrTy, ptrTy, argArray, + mlir::ArrayRef({indice})); + LLVM::StoreOp::create(rewriter, loc, structMember, arrayMember); } return argArray; } @@ -94,8 +95,8 @@ struct GPULaunchKernelConversion mlir::Value dynamicMemorySize = op.getDynamicSharedMemorySize(); mlir::Type i32Ty = rewriter.getI32Type(); if (!dynamicMemorySize) - dynamicMemorySize = rewriter.create( - loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0)); + dynamicMemorySize = mlir::LLVM::ConstantOp::create( + rewriter, loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0)); mlir::Value kernelArgs = createKernelArgArray(loc, adaptor.getKernelOperands(), rewriter); @@ -108,17 +109,17 @@ struct 
GPULaunchKernelConversion if (!funcOp) return mlir::failure(); kernelPtr = - rewriter.create(loc, ptrTy, funcOp.getName()); + LLVM::AddressOfOp::create(rewriter, loc, ptrTy, funcOp.getName()); } else { kernelPtr = - rewriter.create(loc, ptrTy, kernel.getName()); + LLVM::AddressOfOp::create(rewriter, loc, ptrTy, kernel.getName()); } auto llvmIntPtrType = mlir::IntegerType::get( ctx, this->getTypeConverter()->getPointerBitwidth(0)); auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx); - mlir::Value nullPtr = rewriter.create(loc, ptrTy); + mlir::Value nullPtr = LLVM::ZeroOp::create(rewriter, loc, ptrTy); if (op.hasClusterSize()) { auto funcOp = mod.lookupSymbol( @@ -134,8 +135,8 @@ struct GPULaunchKernelConversion if (!funcOp) { mlir::OpBuilder::InsertionGuard insertGuard(rewriter); rewriter.setInsertionPointToStart(mod.getBody()); - auto launchKernelFuncOp = rewriter.create( - loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy); + auto launchKernelFuncOp = mlir::LLVM::LLVMFuncOp::create( + rewriter, loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy); launchKernelFuncOp.setVisibility( mlir::SymbolTable::Visibility::Private); } @@ -148,8 +149,8 @@ struct GPULaunchKernelConversion stream = adaptor.getAsyncDependencies().front(); } - rewriter.create( - loc, funcTy, cufLaunchClusterKernel, + mlir::LLVM::CallOp::create( + rewriter, loc, funcTy, cufLaunchClusterKernel, mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(), adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(), adaptor.getGridSizeX(), adaptor.getGridSizeY(), @@ -178,7 +179,7 @@ struct GPULaunchKernelConversion mlir::OpBuilder::InsertionGuard insertGuard(rewriter); rewriter.setInsertionPointToStart(mod.getBody()); auto launchKernelFuncOp = - rewriter.create(loc, fctName, funcTy); + mlir::LLVM::LLVMFuncOp::create(rewriter, loc, fctName, funcTy); launchKernelFuncOp.setVisibility( mlir::SymbolTable::Visibility::Private); } @@ -191,8 +192,8 @@ struct GPULaunchKernelConversion stream = adaptor.getAsyncDependencies().front(); } - rewriter.create( - loc, funcTy, cufLaunchKernel, + mlir::LLVM::CallOp::create( + rewriter, loc, funcTy, cufLaunchKernel, mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(), adaptor.getGridSizeY(), adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), @@ -222,11 +223,11 @@ static mlir::Value createAddressOfOp(mlir::ConversionPatternRewriter &rewriter, auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get( rewriter.getContext(), mlir::NVVM::NVVMMemorySpace::kSharedMemorySpace); if (auto g = gpuMod.lookupSymbol(sharedGlobalName)) - return rewriter.create(loc, llvmPtrTy, - g.getSymName()); + return mlir::LLVM::AddressOfOp::create(rewriter, loc, llvmPtrTy, + g.getSymName()); if (auto g = gpuMod.lookupSymbol(sharedGlobalName)) - return rewriter.create(loc, llvmPtrTy, - g.getSymName()); + return mlir::LLVM::AddressOfOp::create(rewriter, loc, llvmPtrTy, + g.getSymName()); return {}; } @@ -255,13 +256,13 @@ struct CUFSharedMemoryOpConversion if (!sharedGlobalAddr) mlir::emitError(loc, "Could not find the shared global operation\n"); - auto castPtr = rewriter.create( - loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()), + auto castPtr = mlir::LLVM::AddrSpaceCastOp::create( + rewriter, loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()), sharedGlobalAddr); mlir::Type baseType = castPtr->getResultTypes().front(); llvm::SmallVector gepArgs = {op.getOffset()}; - mlir::Value shmemPtr = rewriter.create( - loc, baseType, rewriter.getI8Type(), castPtr, gepArgs); + mlir::Value shmemPtr = 
mlir::LLVM::GEPOp::create( + rewriter, loc, baseType, rewriter.getI8Type(), castPtr, gepArgs); rewriter.replaceOp(op, {shmemPtr}); return mlir::success(); } diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 750569c126642..cd7d33091f345 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -89,7 +89,7 @@ static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter, mlir::Location loc, mlir::Type toTy, mlir::Value val) { if (val.getType() != toTy) - return rewriter.create(loc, toTy, val); + return fir::ConvertOp::create(rewriter, loc, toTy, val); return val; } @@ -118,7 +118,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op, errmsg = op.getErrmsg(); } else { mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); - errmsg = builder.create(loc, boxNoneTy).getResult(); + errmsg = fir::AbsentOp::create(builder, loc, boxNoneTy).getResult(); } llvm::SmallVector args; if constexpr (std::is_same_v) { @@ -148,7 +148,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op, fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat, errmsg, sourceFile, sourceLine); } - auto callOp = builder.create(loc, func, args); + auto callOp = fir::CallOp::create(builder, loc, func, args); rewriter.replaceOp(op, callOp); return mlir::success(); } @@ -301,10 +301,11 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { if (inDeviceContext(op.getOperation())) { // In device context just replace the cuf.alloc operation with a fir.alloc // the cuf.free will be removed. - auto allocaOp = rewriter.create( - loc, op.getInType(), op.getUniqName() ? *op.getUniqName() : "", - op.getBindcName() ? *op.getBindcName() : "", op.getTypeparams(), - op.getShape()); + auto allocaOp = + fir::AllocaOp::create(rewriter, loc, op.getInType(), + op.getUniqName() ? *op.getUniqName() : "", + op.getBindcName() ? 
*op.getBindcName() : "", + op.getTypeparams(), op.getShape()); allocaOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); rewriter.replaceOp(op, allocaOp); return mlir::success(); @@ -338,14 +339,15 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { assert(!op.getShape().empty() && "expect shape with dynamic arrays"); nbElem = builder.loadIfRef(loc, op.getShape()[0]); for (unsigned i = 1; i < op.getShape().size(); ++i) { - nbElem = rewriter.create( - loc, nbElem, builder.loadIfRef(loc, op.getShape()[i])); + nbElem = mlir::arith::MulIOp::create( + rewriter, loc, nbElem, + builder.loadIfRef(loc, op.getShape()[i])); } } else { nbElem = builder.createIntegerConstant(loc, builder.getIndexType(), seqTy.getConstantArraySize()); } - bytes = rewriter.create(loc, nbElem, width); + bytes = mlir::arith::MulIOp::create(rewriter, loc, nbElem, width); } else if (fir::isa_derived(op.getInType())) { mlir::Type structTy = typeConverter->convertType(op.getInType()); std::size_t structSize = dl->getTypeSizeInBits(structTy) / 8; @@ -363,7 +365,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { loc, builder.getI32Type(), getMemType(op.getDataAttr())); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, bytes, memTy, sourceFile, sourceLine)}; - auto callOp = builder.create(loc, func, args); + auto callOp = fir::CallOp::create(builder, loc, func, args); callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); auto convOp = builder.createConvert(loc, op.getResult().getType(), callOp.getResult(0)); @@ -386,7 +388,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, sizeInBytes, sourceFile, sourceLine)}; - auto callOp = builder.create(loc, func, args); + auto callOp = fir::CallOp::create(builder, loc, func, args); callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); auto convOp = builder.createConvert(loc, op.getResult().getType(), callOp.getResult(0)); @@ -414,8 +416,9 @@ struct CUFDeviceAddressOpConversion op.getHostSymbol().getRootReference().getValue())) { auto mod = op->getParentOfType(); mlir::Location loc = op.getLoc(); - auto hostAddr = rewriter.create( - loc, fir::ReferenceType::get(global.getType()), op.getHostSymbol()); + auto hostAddr = fir::AddrOfOp::create( + rewriter, loc, fir::ReferenceType::get(global.getType()), + op.getHostSymbol()); fir::FirOpBuilder builder(rewriter, mod); mlir::func::FuncOp callee = fir::runtime::getRuntimeFunc(loc, @@ -428,7 +431,7 @@ struct CUFDeviceAddressOpConversion fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, conv, sourceFile, sourceLine)}; - auto call = rewriter.create(loc, callee, args); + auto call = fir::CallOp::create(rewriter, loc, callee, args); mlir::Value addr = createConvertOp(rewriter, loc, hostAddr.getType(), call->getResult(0)); rewriter.replaceOp(op, addr.getDefiningOp()); @@ -456,8 +459,8 @@ struct DeclareOpConversion : public mlir::OpRewritePattern { addrOfOp.getSymbol().getRootReference().getValue())) { if (cuf::isRegisteredDeviceGlobal(global)) { rewriter.setInsertionPointAfter(addrOfOp); - mlir::Value devAddr = rewriter.create( - op.getLoc(), addrOfOp.getType(), addrOfOp.getSymbol()); + mlir::Value devAddr = cuf::DeviceAddressOp::create( + rewriter, op.getLoc(), addrOfOp.getType(), addrOfOp.getSymbol()); rewriter.startOpModification(op); op.getMemrefMutable().assign(devAddr); 
rewriter.finalizeOpModification(op); @@ -502,7 +505,7 @@ struct CUFFreeOpConversion : public mlir::OpRewritePattern { loc, builder.getI32Type(), getMemType(op.getDataAttr())); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, op.getDevptr(), memTy, sourceFile, sourceLine)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); rewriter.eraseOp(op); return mlir::success(); } @@ -515,7 +518,7 @@ struct CUFFreeOpConversion : public mlir::OpRewritePattern { fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, op.getDevptr(), sourceFile, sourceLine)}; - auto callOp = builder.create(loc, func, args); + auto callOp = fir::CallOp::create(builder, loc, func, args); callOp->setAttr(cuf::getDataAttrName(), op.getDataAttrAttr()); rewriter.eraseOp(op); return mlir::success(); @@ -558,18 +561,18 @@ static mlir::Value emboxSrc(mlir::PatternRewriter &rewriter, srcTy = fir::LogicalType::get(rewriter.getContext(), 4); src = createConvertOp(rewriter, loc, srcTy, src); addr = builder.createTemporary(loc, srcTy); - builder.create(loc, src, addr); + fir::StoreOp::create(builder, loc, src, addr); } else { if (dstEleTy && fir::isa_trivial(dstEleTy) && srcTy != dstEleTy) { // Use dstEleTy and convert to avoid assign mismatch. addr = builder.createTemporary(loc, dstEleTy); - auto conv = builder.create(loc, dstEleTy, src); - builder.create(loc, conv, addr); + auto conv = fir::ConvertOp::create(builder, loc, dstEleTy, src); + fir::StoreOp::create(builder, loc, conv, addr); srcTy = dstEleTy; } else { // Put constant in memory if it is not. addr = builder.createTemporary(loc, srcTy); - builder.create(loc, src, addr); + fir::StoreOp::create(builder, loc, src, addr); } } } else { @@ -582,7 +585,7 @@ static mlir::Value emboxSrc(mlir::PatternRewriter &rewriter, /*slice=*/nullptr, lenParams, /*tdesc=*/nullptr); mlir::Value src = builder.createTemporary(loc, box.getType()); - builder.create(loc, box, src); + fir::StoreOp::create(builder, loc, box, src); return src; } @@ -601,7 +604,7 @@ static mlir::Value emboxDst(mlir::PatternRewriter &rewriter, /*slice=*/nullptr, lenParams, /*tdesc=*/nullptr); mlir::Value dst = builder.createTemporary(loc, dstBox.getType()); - builder.create(loc, dstBox, dst); + fir::StoreOp::create(builder, loc, dstBox, dst); return dst; } @@ -660,7 +663,7 @@ struct CUFDataTransferOpConversion fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, dst, src, modeValue, sourceFile, sourceLine)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); rewriter.eraseOp(op); return mlir::success(); } @@ -679,12 +682,12 @@ struct CUFDataTransferOpConversion extents.push_back(i.value()); } - nbElement = rewriter.create(loc, i64Ty, extents[0]); + nbElement = fir::ConvertOp::create(rewriter, loc, i64Ty, extents[0]); for (unsigned i = 1; i < extents.size(); ++i) { auto operand = - rewriter.create(loc, i64Ty, extents[i]); + fir::ConvertOp::create(rewriter, loc, i64Ty, extents[i]); nbElement = - rewriter.create(loc, nbElement, operand); + mlir::arith::MulIOp::create(rewriter, loc, nbElement, operand); } } else { if (auto seqTy = mlir::dyn_cast_or_null(dstTy)) @@ -699,12 +702,11 @@ struct CUFDataTransferOpConversion } else { width = computeWidth(loc, dstTy, kindMap); } - mlir::Value widthValue = rewriter.create( - loc, i64Ty, rewriter.getIntegerAttr(i64Ty, width)); - 
mlir::Value bytes = - nbElement - ? rewriter.create(loc, nbElement, widthValue) - : widthValue; + mlir::Value widthValue = mlir::arith::ConstantOp::create( + rewriter, loc, i64Ty, rewriter.getIntegerAttr(i64Ty, width)); + mlir::Value bytes = nbElement ? mlir::arith::MulIOp::create( + rewriter, loc, nbElement, widthValue) + : widthValue; mlir::func::FuncOp func = fir::runtime::getRuntimeFunc(loc, @@ -719,13 +721,13 @@ struct CUFDataTransferOpConversion // Materialize the src if constant. if (matchPattern(src.getDefiningOp(), mlir::m_Constant())) { mlir::Value temp = builder.createTemporary(loc, srcTy); - builder.create(loc, src, temp); + fir::StoreOp::create(builder, loc, src, temp); src = temp; } llvm::SmallVector args{ fir::runtime::createArguments(builder, loc, fTy, dst, src, bytes, modeValue, sourceFile, sourceLine)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); rewriter.eraseOp(op); return mlir::success(); } @@ -734,7 +736,7 @@ struct CUFDataTransferOpConversion if (mlir::isa(val.getDefiningOp())) { // Materialize the box to memory to be able to call the runtime. mlir::Value box = builder.createTemporary(loc, val.getType()); - builder.create(loc, val, box); + fir::StoreOp::create(builder, loc, val, box); return box; } return val; @@ -768,7 +770,7 @@ struct CUFDataTransferOpConversion fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, dst, src, modeValue, sourceFile, sourceLine)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); rewriter.eraseOp(op); } else { // Transfer from a descriptor. @@ -784,7 +786,7 @@ struct CUFDataTransferOpConversion fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, dst, src, modeValue, sourceFile, sourceLine)}; - builder.create(loc, func, args); + fir::CallOp::create(builder, loc, func, args); rewriter.eraseOp(op); } return mlir::success(); @@ -810,20 +812,21 @@ struct CUFLaunchOpConversion mlir::PatternRewriter &rewriter) const override { mlir::Location loc = op.getLoc(); auto idxTy = mlir::IndexType::get(op.getContext()); - mlir::Value zero = rewriter.create( - loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0)); + mlir::Value zero = mlir::arith::ConstantOp::create( + rewriter, loc, rewriter.getIntegerType(32), + rewriter.getI32IntegerAttr(0)); auto gridSizeX = - rewriter.create(loc, idxTy, op.getGridX()); + mlir::arith::IndexCastOp::create(rewriter, loc, idxTy, op.getGridX()); auto gridSizeY = - rewriter.create(loc, idxTy, op.getGridY()); + mlir::arith::IndexCastOp::create(rewriter, loc, idxTy, op.getGridY()); auto gridSizeZ = - rewriter.create(loc, idxTy, op.getGridZ()); + mlir::arith::IndexCastOp::create(rewriter, loc, idxTy, op.getGridZ()); auto blockSizeX = - rewriter.create(loc, idxTy, op.getBlockX()); + mlir::arith::IndexCastOp::create(rewriter, loc, idxTy, op.getBlockX()); auto blockSizeY = - rewriter.create(loc, idxTy, op.getBlockY()); + mlir::arith::IndexCastOp::create(rewriter, loc, idxTy, op.getBlockY()); auto blockSizeZ = - rewriter.create(loc, idxTy, op.getBlockZ()); + mlir::arith::IndexCastOp::create(rewriter, loc, idxTy, op.getBlockZ()); auto kernelName = mlir::SymbolRefAttr::get( rewriter.getStringAttr(cudaDeviceModuleName), {mlir::SymbolRefAttr::get( @@ -835,12 +838,12 @@ struct CUFLaunchOpConversion op.getCallee().getLeafReference())) { if (auto clusterDimsAttr = funcOp->getAttrOfType( 
cuf::getClusterDimsAttrName())) { - clusterDimX = rewriter.create( - loc, clusterDimsAttr.getX().getInt()); - clusterDimY = rewriter.create( - loc, clusterDimsAttr.getY().getInt()); - clusterDimZ = rewriter.create( - loc, clusterDimsAttr.getZ().getInt()); + clusterDimX = mlir::arith::ConstantIndexOp::create( + rewriter, loc, clusterDimsAttr.getX().getInt()); + clusterDimY = mlir::arith::ConstantIndexOp::create( + rewriter, loc, clusterDimsAttr.getY().getInt()); + clusterDimZ = mlir::arith::ConstantIndexOp::create( + rewriter, loc, clusterDimsAttr.getZ().getInt()); } procAttr = funcOp->getAttrOfType(cuf::getProcAttrName()); @@ -870,8 +873,9 @@ struct CUFLaunchOpConversion args.push_back(arg); } mlir::Value dynamicShmemSize = op.getBytes() ? op.getBytes() : zero; - auto gpuLaunchOp = rewriter.create( - loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ}, + auto gpuLaunchOp = mlir::gpu::LaunchFuncOp::create( + rewriter, loc, kernelName, + mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ}, mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, dynamicShmemSize, args); if (clusterDimX && clusterDimY && clusterDimZ) { @@ -883,7 +887,7 @@ struct CUFLaunchOpConversion mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(gpuLaunchOp); mlir::Value stream = - rewriter.create(loc, op.getStream()); + cuf::StreamCastOp::create(rewriter, loc, op.getStream()); gpuLaunchOp.getAsyncDependenciesMutable().append(stream); } if (procAttr) @@ -916,8 +920,9 @@ struct CUFSyncDescriptorOpConversion if (!globalOp) return mlir::failure(); - auto hostAddr = builder.create( - loc, fir::ReferenceType::get(globalOp.getType()), op.getGlobalName()); + auto hostAddr = fir::AddrOfOp::create( + builder, loc, fir::ReferenceType::get(globalOp.getType()), + op.getGlobalName()); fir::runtime::cuda::genSyncGlobalDescriptor(builder, loc, hostAddr); op.erase(); return mlir::success(); diff --git a/flang/lib/Optimizer/Transforms/CharacterConversion.cpp b/flang/lib/Optimizer/Transforms/CharacterConversion.cpp index aee7e8ca5cb66..13da38e92c234 100644 --- a/flang/lib/Optimizer/Transforms/CharacterConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CharacterConversion.cpp @@ -48,12 +48,13 @@ class CharacterConvertConversion << "running character conversion on " << conv << '\n'); // Establish a loop that executes count iterations. 
- auto zero = rewriter.create(loc, 0); - auto one = rewriter.create(loc, 1); + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); auto idxTy = rewriter.getIndexType(); - auto castCnt = rewriter.create(loc, idxTy, conv.getCount()); - auto countm1 = rewriter.create(loc, castCnt, one); - auto loop = rewriter.create(loc, zero, countm1, one); + auto castCnt = + fir::ConvertOp::create(rewriter, loc, idxTy, conv.getCount()); + auto countm1 = mlir::arith::SubIOp::create(rewriter, loc, castCnt, one); + auto loop = fir::DoLoopOp::create(rewriter, loc, zero, countm1, one); auto insPt = rewriter.saveInsertionPoint(); rewriter.setInsertionPointToStart(loop.getBody()); @@ -75,21 +76,22 @@ class CharacterConvertConversion auto toTy = rewriter.getIntegerType(toBits); auto toPtrTy = pointerType(toBits); auto fromPtr = - rewriter.create(loc, fromPtrTy, conv.getFrom()); - auto toPtr = rewriter.create(loc, toPtrTy, conv.getTo()); + fir::ConvertOp::create(rewriter, loc, fromPtrTy, conv.getFrom()); + auto toPtr = fir::ConvertOp::create(rewriter, loc, toPtrTy, conv.getTo()); auto getEleTy = [&](unsigned bits) { return fir::ReferenceType::get(rewriter.getIntegerType(bits)); }; - auto fromi = rewriter.create( - loc, getEleTy(fromBits), fromPtr, - mlir::ValueRange{loop.getInductionVar()}); - auto toi = rewriter.create( - loc, getEleTy(toBits), toPtr, mlir::ValueRange{loop.getInductionVar()}); - auto load = rewriter.create(loc, fromi); + auto fromi = + fir::CoordinateOp::create(rewriter, loc, getEleTy(fromBits), fromPtr, + mlir::ValueRange{loop.getInductionVar()}); + auto toi = + fir::CoordinateOp::create(rewriter, loc, getEleTy(toBits), toPtr, + mlir::ValueRange{loop.getInductionVar()}); + auto load = fir::LoadOp::create(rewriter, loc, fromi); mlir::Value icast = (fromBits >= toBits) - ? rewriter.create(loc, toTy, load).getResult() - : rewriter.create(loc, toTy, load) + ? fir::ConvertOp::create(rewriter, loc, toTy, load).getResult() + : mlir::arith::ExtUIOp::create(rewriter, loc, toTy, load) .getResult(); rewriter.replaceOpWithNewOp(conv, icast, toi); rewriter.restoreInsertionPoint(insPt); diff --git a/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp b/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp index 239a7cdaa4cf2..afafbd8179aff 100644 --- a/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp +++ b/flang/lib/Optimizer/Transforms/ConstantArgumentGlobalisation.cpp @@ -111,11 +111,11 @@ class CallOpRewriter : public mlir::OpRewritePattern { builder.insert(cln); mlir::Value val = builder.createConvert(loc, varTy, cln->getResult(0)); - builder.create(loc, val); + fir::HasValueOp::create(builder, loc, val); }, builder.createInternalLinkage()); - mlir::Value addr = builder.create(loc, global.resultType(), - global.getSymbol()); + mlir::Value addr = fir::AddrOfOp::create( + builder, loc, global.resultType(), global.getSymbol()); newOperands.push_back(addr); needUpdate = true; } @@ -125,11 +125,11 @@ class CallOpRewriter : public mlir::OpRewritePattern { llvm::SmallVector newResultTypes; newResultTypes.append(callOp.getResultTypes().begin(), callOp.getResultTypes().end()); - fir::CallOp newOp = builder.create( - loc, - callOp.getCallee().has_value() ? callOp.getCallee().value() - : mlir::SymbolRefAttr{}, - newResultTypes, newOperands); + fir::CallOp newOp = fir::CallOp::create(builder, loc, + callOp.getCallee().has_value() + ? 
callOp.getCallee().value() + : mlir::SymbolRefAttr{}, + newResultTypes, newOperands); // Copy all the attributes from the old to new op. newOp->setAttrs(callOp->getAttrs()); rewriter.replaceOp(callOp, newOp); diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp index 3d35803e6a2d3..e466aed753e63 100644 --- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp +++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp @@ -83,17 +83,17 @@ class CfgLoopConv : public mlir::OpRewritePattern { // Initalization block rewriter.setInsertionPointToEnd(initBlock); - auto diff = rewriter.create(loc, high, low); - auto distance = rewriter.create(loc, diff, step); + auto diff = mlir::arith::SubIOp::create(rewriter, loc, high, low); + auto distance = mlir::arith::AddIOp::create(rewriter, loc, diff, step); mlir::Value iters = - rewriter.create(loc, distance, step); + mlir::arith::DivSIOp::create(rewriter, loc, distance, step); if (forceLoopToExecuteOnce) { - auto zero = rewriter.create(loc, 0); - auto cond = rewriter.create( - loc, arith::CmpIPredicate::sle, iters, zero); - auto one = rewriter.create(loc, 1); - iters = rewriter.create(loc, cond, one, iters); + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto cond = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sle, iters, zero); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); + iters = mlir::arith::SelectOp::create(rewriter, loc, cond, one, iters); } llvm::SmallVector loopOperands; @@ -102,20 +102,20 @@ class CfgLoopConv : public mlir::OpRewritePattern { loopOperands.append(operands.begin(), operands.end()); loopOperands.push_back(iters); - rewriter.create(loc, conditionalBlock, loopOperands); + mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopOperands); // Last loop block auto *terminator = lastBlock->getTerminator(); rewriter.setInsertionPointToEnd(lastBlock); auto iv = conditionalBlock->getArgument(0); mlir::Value steppedIndex = - rewriter.create(loc, iv, step, iofAttr); + mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr); assert(steppedIndex && "must be a Value"); auto lastArg = conditionalBlock->getNumArguments() - 1; auto itersLeft = conditionalBlock->getArgument(lastArg); - auto one = rewriter.create(loc, 1); + auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1); mlir::Value itersMinusOne = - rewriter.create(loc, itersLeft, one); + mlir::arith::SubIOp::create(rewriter, loc, itersLeft, one); llvm::SmallVector loopCarried; loopCarried.push_back(steppedIndex); @@ -123,8 +123,8 @@ class CfgLoopConv : public mlir::OpRewritePattern { : terminator->operand_begin(); loopCarried.append(begin, terminator->operand_end()); loopCarried.push_back(itersMinusOne); - auto backEdge = - rewriter.create(loc, conditionalBlock, loopCarried); + auto backEdge = mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, + loopCarried); rewriter.eraseOp(terminator); // Copy loop annotations from the do loop to the loop back edge. 
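For reference, the initialization block built above computes an explicit iteration count instead of comparing the induction variable against the bound on every trip. A host-side sketch of that arithmetic, assuming a nonzero step (hypothetical helper, not part of the patch):

#include <cstdint>

// Trip count used by CfgLoopConv when lowering a counted fir.do_loop to a CFG.
static int64_t tripCount(int64_t low, int64_t high, int64_t step,
                         bool forceLoopToExecuteOnce) {
  int64_t iters = (high - low + step) / step; // signed division, step != 0
  if (forceLoopToExecuteOnce && iters <= 0)
    return 1; // guarantee at least one iteration when requested
  return iters;
}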
@@ -133,13 +133,13 @@ class CfgLoopConv : public mlir::OpRewritePattern { // Conditional block rewriter.setInsertionPointToEnd(conditionalBlock); - auto zero = rewriter.create(loc, 0); - auto comparison = rewriter.create( - loc, arith::CmpIPredicate::sgt, itersLeft, zero); + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto comparison = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sgt, itersLeft, zero); - rewriter.create( - loc, comparison, firstBlock, llvm::ArrayRef(), endBlock, - llvm::ArrayRef()); + mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBlock, + llvm::ArrayRef(), endBlock, + llvm::ArrayRef()); // The result of the loop operation is the values of the condition block // arguments except the induction variable on the last iteration. @@ -180,7 +180,7 @@ class CfgIfConv : public mlir::OpRewritePattern { continueBlock = rewriter.createBlock( remainingOpsBlock, ifOp.getResultTypes(), llvm::SmallVector(ifOp.getNumResults(), loc)); - rewriter.create(loc, remainingOpsBlock); + mlir::cf::BranchOp::create(rewriter, loc, remainingOpsBlock); } // Move blocks from the "then" region to the region containing 'fir.if', @@ -190,8 +190,8 @@ class CfgIfConv : public mlir::OpRewritePattern { auto *ifOpTerminator = ifOpRegion.back().getTerminator(); auto ifOpTerminatorOperands = ifOpTerminator->getOperands(); rewriter.setInsertionPointToEnd(&ifOpRegion.back()); - rewriter.create(loc, continueBlock, - ifOpTerminatorOperands); + mlir::cf::BranchOp::create(rewriter, loc, continueBlock, + ifOpTerminatorOperands); rewriter.eraseOp(ifOpTerminator); rewriter.inlineRegionBefore(ifOpRegion, continueBlock); @@ -205,16 +205,17 @@ class CfgIfConv : public mlir::OpRewritePattern { auto *otherwiseTerm = otherwiseRegion.back().getTerminator(); auto otherwiseTermOperands = otherwiseTerm->getOperands(); rewriter.setInsertionPointToEnd(&otherwiseRegion.back()); - rewriter.create(loc, continueBlock, - otherwiseTermOperands); + mlir::cf::BranchOp::create(rewriter, loc, continueBlock, + otherwiseTermOperands); rewriter.eraseOp(otherwiseTerm); rewriter.inlineRegionBefore(otherwiseRegion, continueBlock); } rewriter.setInsertionPointToEnd(condBlock); - auto branchOp = rewriter.create( - loc, ifOp.getCondition(), ifOpBlock, llvm::ArrayRef(), - otherwiseBlock, llvm::ArrayRef()); + auto branchOp = mlir::cf::CondBranchOp::create( + rewriter, loc, ifOp.getCondition(), ifOpBlock, + llvm::ArrayRef(), otherwiseBlock, + llvm::ArrayRef()); llvm::ArrayRef weights = ifOp.getWeights(); if (!weights.empty()) branchOp.setWeights(weights); @@ -269,7 +270,7 @@ class CfgIterWhileConv : public mlir::OpRewritePattern { rewriter.setInsertionPointToEnd(lastBodyBlock); auto step = whileOp.getStep(); mlir::Value stepped = - rewriter.create(loc, iv, step, iofAttr); + mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr); assert(stepped && "must be a Value"); llvm::SmallVector loopCarried; @@ -278,7 +279,7 @@ class CfgIterWhileConv : public mlir::OpRewritePattern { ? std::next(terminator->operand_begin()) : terminator->operand_begin(); loopCarried.append(begin, terminator->operand_end()); - rewriter.create(loc, conditionBlock, loopCarried); + mlir::cf::BranchOp::create(rewriter, loc, conditionBlock, loopCarried); rewriter.eraseOp(terminator); // Compute loop bounds before branching to the condition. 
@@ -293,31 +294,31 @@ class CfgIterWhileConv : public mlir::OpRewritePattern { destOperands.push_back(lowerBound); auto iterOperands = whileOp.getIterOperands(); destOperands.append(iterOperands.begin(), iterOperands.end()); - rewriter.create(loc, conditionBlock, destOperands); + mlir::cf::BranchOp::create(rewriter, loc, conditionBlock, destOperands); // With the body block done, we can fill in the condition block. rewriter.setInsertionPointToEnd(conditionBlock); // The comparison depends on the sign of the step value. We fully expect // this expression to be folded by the optimizer or LLVM. This expression // is written this way so that `step == 0` always returns `false`. - auto zero = rewriter.create(loc, 0); - auto compl0 = rewriter.create( - loc, arith::CmpIPredicate::slt, zero, step); - auto compl1 = rewriter.create( - loc, arith::CmpIPredicate::sle, iv, upperBound); - auto compl2 = rewriter.create( - loc, arith::CmpIPredicate::slt, step, zero); - auto compl3 = rewriter.create( - loc, arith::CmpIPredicate::sle, upperBound, iv); - auto cmp0 = rewriter.create(loc, compl0, compl1); - auto cmp1 = rewriter.create(loc, compl2, compl3); - auto cmp2 = rewriter.create(loc, cmp0, cmp1); + auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0); + auto compl0 = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::slt, zero, step); + auto compl1 = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sle, iv, upperBound); + auto compl2 = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::slt, step, zero); + auto compl3 = mlir::arith::CmpIOp::create( + rewriter, loc, arith::CmpIPredicate::sle, upperBound, iv); + auto cmp0 = mlir::arith::AndIOp::create(rewriter, loc, compl0, compl1); + auto cmp1 = mlir::arith::AndIOp::create(rewriter, loc, compl2, compl3); + auto cmp2 = mlir::arith::OrIOp::create(rewriter, loc, cmp0, cmp1); // Remember to AND in the early-exit bool. auto comparison = - rewriter.create(loc, iterateVar, cmp2); - rewriter.create( - loc, comparison, firstBodyBlock, llvm::ArrayRef(), - endBlock, llvm::ArrayRef()); + mlir::arith::AndIOp::create(rewriter, loc, iterateVar, cmp2); + mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBodyBlock, + llvm::ArrayRef(), endBlock, + llvm::ArrayRef()); // The result of the loop operation is the values of the condition block // arguments except the induction variable on the last iteration. 
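The chain of compares assembled above encodes the continuation test for fir.iterate_while. Written with host booleans it reduces to the following sketch, where iterateVar is the early-exit value carried by the loop (hypothetical helper, not part of the patch):

#include <cstdint>

// step == 0 makes both disjuncts false, so the loop always exits, matching
// the comment in the pass.
static bool keepIterating(int64_t iv, int64_t upperBound, int64_t step,
                          bool iterateVar) {
  bool ascending = 0 < step && iv <= upperBound;
  bool descending = step < 0 && upperBound <= iv;
  return iterateVar && (ascending || descending);
}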
auto args = whileOp.getFinalValue() diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index abad500d3f657..5dcb54eaf9b9d 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -103,13 +103,14 @@ mlir::LLVM::DILocalVariableAttr DebugTypeGenerator::generateArtificialVariable( mlir::Type type = val.getType(); if (!mlir::isa(type) || !type.isSignlessInteger()) { type = builder.getIntegerType(64); - val = builder.create(declOp.getLoc(), type, val); + val = fir::ConvertOp::create(builder, declOp.getLoc(), type, val); } mlir::LLVM::DITypeAttr Ty = convertType(type, fileAttr, scope, declOp); auto lvAttr = mlir::LLVM::DILocalVariableAttr::get( context, scope, name, fileAttr, /*line=*/0, /*argNo=*/0, /*alignInBits=*/0, Ty, mlir::LLVM::DIFlags::Artificial); - builder.create(declOp.getLoc(), val, lvAttr, nullptr); + mlir::LLVM::DbgValueOp::create(builder, declOp.getLoc(), val, lvAttr, + nullptr); return lvAttr; } diff --git a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp index f06ad2db90d55..d7d1865bc56ba 100644 --- a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp +++ b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp @@ -49,13 +49,13 @@ struct DoLoopConversion : public OpRewritePattern { // must be a positive value. // For easier conversion, we calculate the trip count and use a canonical // induction variable. - auto diff = rewriter.create(loc, high, low); - auto distance = rewriter.create(loc, diff, step); - auto tripCount = rewriter.create(loc, distance, step); - auto zero = rewriter.create(loc, 0); - auto one = rewriter.create(loc, 1); + auto diff = arith::SubIOp::create(rewriter, loc, high, low); + auto distance = arith::AddIOp::create(rewriter, loc, diff, step); + auto tripCount = arith::DivSIOp::create(rewriter, loc, distance, step); + auto zero = arith::ConstantIndexOp::create(rewriter, loc, 0); + auto one = arith::ConstantIndexOp::create(rewriter, loc, 1); auto scfForOp = - rewriter.create(loc, zero, tripCount, one, iterArgs); + scf::ForOp::create(rewriter, loc, zero, tripCount, one, iterArgs); auto &loopOps = doLoopOp.getBody()->getOperations(); auto resultOp = cast(doLoopOp.getBody()->getTerminator()); @@ -68,12 +68,12 @@ struct DoLoopConversion : public OpRewritePattern { rewriter.setInsertionPointToStart(loweredBody); Value iv = - rewriter.create(loc, scfForOp.getInductionVar(), step); - iv = rewriter.create(loc, low, iv); + arith::MulIOp::create(rewriter, loc, scfForOp.getInductionVar(), step); + iv = arith::AddIOp::create(rewriter, loc, low, iv); if (!results.empty()) { rewriter.setInsertionPointToEnd(loweredBody); - rewriter.create(resultOp->getLoc(), results); + scf::YieldOp::create(rewriter, resultOp->getLoc(), results); } doLoopOp.getInductionVar().replaceAllUsesWith(iv); rewriter.replaceAllUsesWith(doLoopOp.getRegionIterArgs(), diff --git a/flang/lib/Optimizer/Transforms/GenRuntimeCallsForTest.cpp b/flang/lib/Optimizer/Transforms/GenRuntimeCallsForTest.cpp index 7ea3b9c670c69..699be12178881 100644 --- a/flang/lib/Optimizer/Transforms/GenRuntimeCallsForTest.cpp +++ b/flang/lib/Optimizer/Transforms/GenRuntimeCallsForTest.cpp @@ -91,8 +91,8 @@ void GenRuntimeCallsForTestPass::runOnOperation() { // Generate the wrapper function body that consists of a call and return. 
builder.setInsertionPointToStart(callerFunc.addEntryBlock()); mlir::Block::BlockArgListType args = callerFunc.front().getArguments(); - auto callOp = builder.create(loc, funcOp, args); - builder.create(loc, callOp.getResults()); + auto callOp = fir::CallOp::create(builder, loc, funcOp, args); + mlir::func::ReturnOp::create(builder, loc, callOp.getResults()); newFuncs.push_back(callerFunc.getOperation()); builder.restoreInsertionPoint(insertPt); diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp index 056bdf63d914f..0095159398155 100644 --- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp +++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp @@ -285,7 +285,7 @@ static mlir::Value getIndex(fir::FirOpBuilder &builder, mlir::Operation *op, // index_0 = index - lb; if (lb.getType() != index.getType()) lb = builder.createConvert(coop.getLoc(), index.getType(), lb); - return builder.create(coop.getLoc(), index, lb); + return mlir::arith::SubIOp::create(builder, coop.getLoc(), index, lb); } void LoopVersioningPass::runOnOperation() { @@ -483,26 +483,26 @@ void LoopVersioningPass::runOnOperation() { unsigned ndims = arg.rank; for (unsigned i = 0; i < ndims; i++) { mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); - arg.dims[i] = builder.create(loc, idxTy, idxTy, idxTy, - arg.arg, dimIdx); + arg.dims[i] = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, + arg.arg, dimIdx); } // We only care about lowest order dimension, here. mlir::Value elemSize = builder.createIntegerConstant(loc, idxTy, arg.size); - mlir::Value cmp = builder.create( - loc, mlir::arith::CmpIPredicate::eq, arg.dims[0].getResult(2), - elemSize); + mlir::Value cmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, + arg.dims[0].getResult(2), elemSize); if (!allCompares) { allCompares = cmp; } else { allCompares = - builder.create(loc, cmp, allCompares); + mlir::arith::AndIOp::create(builder, loc, cmp, allCompares); } } auto ifOp = - builder.create(loc, op.op->getResultTypes(), allCompares, - /*withElse=*/true); + fir::IfOp::create(builder, loc, op.op->getResultTypes(), allCompares, + /*withElse=*/true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); LLVM_DEBUG(llvm::dbgs() << "Creating cloned loop\n"); @@ -515,8 +515,8 @@ void LoopVersioningPass::runOnOperation() { mlir::Type arrTy = fir::SequenceType::get(newShape, elementType); mlir::Type boxArrTy = fir::BoxType::get(arrTy); mlir::Type refArrTy = builder.getRefType(arrTy); - auto carg = builder.create(loc, boxArrTy, arg.arg); - auto caddr = builder.create(loc, refArrTy, carg); + auto carg = fir::ConvertOp::create(builder, loc, boxArrTy, arg.arg); + auto caddr = fir::BoxAddrOp::create(builder, loc, refArrTy, carg); auto insPt = builder.saveInsertionPoint(); // Use caddr instead of arg. clonedLoop->walk([&](mlir::Operation *coop) { @@ -540,9 +540,9 @@ void LoopVersioningPass::runOnOperation() { mlir::Value scale = builder.createConvert(loc, idxTy, arg.dims[i].getResult(2)); curIndex = - builder.create(loc, scale, curIndex); - totalIndex = (totalIndex) ? builder.create( - loc, curIndex, totalIndex) + mlir::arith::MulIOp::create(builder, loc, scale, curIndex); + totalIndex = (totalIndex) ? 
mlir::arith::AddIOp::create( + builder, loc, curIndex, totalIndex) : curIndex; } // This is the lowest dimension - which doesn't need scaling @@ -554,16 +554,16 @@ void LoopVersioningPass::runOnOperation() { unsigned bits = llvm::Log2_32(arg.size); mlir::Value elemShift = builder.createIntegerConstant(loc, idxTy, bits); - totalIndex = builder.create( - loc, - builder.create(loc, totalIndex, - elemShift), + totalIndex = mlir::arith::AddIOp::create( + builder, loc, + mlir::arith::ShRSIOp::create(builder, loc, totalIndex, + elemShift), finalIndex); } else { totalIndex = finalIndex; } - auto newOp = builder.create( - loc, builder.getRefType(elementType), caddr, + auto newOp = fir::CoordinateOp::create( + builder, loc, builder.getRefType(elementType), caddr, mlir::ValueRange{totalIndex}); LLVM_DEBUG(newOp->dump()); coop->getResult(0).replaceAllUsesWith(newOp->getResult(0)); @@ -582,7 +582,7 @@ void LoopVersioningPass::runOnOperation() { mlir::ResultRange results = clonedLoop->getResults(); bool hasResults = (results.size() > 0); if (hasResults) - builder.create(loc, results); + fir::ResultOp::create(builder, loc, results); // Add the original loop in the else-side of the if operation. builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); @@ -591,7 +591,7 @@ void LoopVersioningPass::runOnOperation() { builder.insert(op.op); // Rely on "cloned loop has results, so original loop also has results". if (hasResults) { - builder.create(loc, op.op->getResults()); + fir::ResultOp::create(builder, loc, op.op->getResults()); } else { // Use an assert to check this. assert(op.op->getResults().size() == 0 && diff --git a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp index 3f308a8f4b560..99040898728bb 100644 --- a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp +++ b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp @@ -68,9 +68,9 @@ static mlir::Value genAllocmem(mlir::OpBuilder &builder, fir::AllocaOp alloca, }; llvm::StringRef uniqName = unpackName(alloca.getUniqName()); llvm::StringRef bindcName = unpackName(alloca.getBindcName()); - auto heap = builder.create(alloca.getLoc(), varTy, uniqName, - bindcName, alloca.getTypeparams(), - alloca.getShape()); + auto heap = fir::AllocMemOp::create(builder, alloca.getLoc(), varTy, uniqName, + bindcName, alloca.getTypeparams(), + alloca.getShape()); LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: replaced " << alloca << " with " << heap << '\n'); return heap; @@ -78,7 +78,7 @@ static mlir::Value genAllocmem(mlir::OpBuilder &builder, fir::AllocaOp alloca, static void genFreemem(mlir::Location loc, mlir::OpBuilder &builder, mlir::Value allocmem) { - [[maybe_unused]] auto free = builder.create(loc, allocmem); + [[maybe_unused]] auto free = fir::FreeMemOp::create(builder, loc, allocmem); LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: add free " << free << " for " << allocmem << '\n'); } diff --git a/flang/lib/Optimizer/Transforms/MemoryUtils.cpp b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp index 1f8edf851de9b..789111cd35f67 100644 --- a/flang/lib/Optimizer/Transforms/MemoryUtils.cpp +++ b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp @@ -200,30 +200,30 @@ void AllocaReplaceImpl::genIndirectDeallocation( // and access it indirectly in the entry points that do not dominate. 
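The LoopVersioning hunk above replaces descriptor-based fir.coordinate_of addressing with direct indexing into the raw data. The flattened index it builds can be summarized as follows, assuming a power-of-two element size, which is what the shift-based scaling relies on (host-side sketch; the pass emits arith.muli/shrsi/addi on SSA values instead):

#include <cstdint>
#include <vector>

static int64_t
flattenIndex(const std::vector<int64_t> &indices,     // per-dim, lower bounds already subtracted
             const std::vector<int64_t> &byteStrides, // from fir.box_dims
             unsigned elemShift) {                     // log2(element size)
  int64_t outerBytes = 0;
  for (std::size_t i = indices.size() - 1; i > 0; --i)
    outerBytes += indices[i] * byteStrides[i]; // byte offset of the outer dims
  // Convert bytes to elements, then add the innermost index, which needs no
  // scaling because the loop is only versioned when stride == element size.
  return (outerBytes >> elemShift) + indices[0];
}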
rewriter.setInsertionPointToStart(&owningRegion.front()); mlir::Type heapType = fir::HeapType::get(alloca.getInType()); - mlir::Value ptrVar = rewriter.create(loc, heapType); - mlir::Value nullPtr = rewriter.create(loc, heapType); - rewriter.create(loc, nullPtr, ptrVar); + mlir::Value ptrVar = fir::AllocaOp::create(rewriter, loc, heapType); + mlir::Value nullPtr = fir::ZeroOp::create(rewriter, loc, heapType); + fir::StoreOp::create(rewriter, loc, nullPtr, ptrVar); // TODO: introducing a pointer compare op in FIR would help // generating less IR here. mlir::Type intPtrTy = fir::getIntPtrType(rewriter); - mlir::Value c0 = rewriter.create( - loc, intPtrTy, rewriter.getIntegerAttr(intPtrTy, 0)); + mlir::Value c0 = mlir::arith::ConstantOp::create( + rewriter, loc, intPtrTy, rewriter.getIntegerAttr(intPtrTy, 0)); // Store new storage address right after its creation. rewriter.restoreInsertionPoint(replacementInsertPoint); mlir::Value castReplacement = fir::factory::createConvert(rewriter, loc, heapType, replacement); - rewriter.create(loc, castReplacement, ptrVar); + fir::StoreOp::create(rewriter, loc, castReplacement, ptrVar); // Generate conditional deallocation at every deallocation point. auto genConditionalDealloc = [&](mlir::Location loc) { - mlir::Value ptrVal = rewriter.create(loc, ptrVar); + mlir::Value ptrVal = fir::LoadOp::create(rewriter, loc, ptrVar); mlir::Value ptrToInt = - rewriter.create(loc, intPtrTy, ptrVal); - mlir::Value isAllocated = rewriter.create( - loc, mlir::arith::CmpIPredicate::ne, ptrToInt, c0); - auto ifOp = rewriter.create(loc, std::nullopt, isAllocated, - /*withElseRegion=*/false); + fir::ConvertOp::create(rewriter, loc, intPtrTy, ptrVal); + mlir::Value isAllocated = mlir::arith::CmpIOp::create( + rewriter, loc, mlir::arith::CmpIPredicate::ne, ptrToInt, c0); + auto ifOp = fir::IfOp::create(rewriter, loc, mlir::TypeRange{}, isAllocated, + /*withElseRegion=*/false); rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); mlir::Value cast = fir::factory::createConvert( rewriter, loc, replacement.getType(), ptrVal); diff --git a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp index 57eae1ff052a2..2c6601dec6e16 100644 --- a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp @@ -183,49 +183,51 @@ struct DispatchOpConv : public OpConversionPattern { mlir::Type tdescType = fir::TypeDescType::get(mlir::NoneType::get(rewriter.getContext())); mlir::Value boxDesc = - rewriter.create(loc, tdescType, passedObject); - boxDesc = rewriter.create( - loc, fir::ReferenceType::get(typeDescTy), boxDesc); + fir::BoxTypeDescOp::create(rewriter, loc, tdescType, passedObject); + boxDesc = fir::ConvertOp::create( + rewriter, loc, fir::ReferenceType::get(typeDescTy), boxDesc); // Load the bindings descriptor. 
auto bindingsCompName = Fortran::semantics::bindingDescCompName; fir::RecordType typeDescRecTy = mlir::cast(typeDescTy); - mlir::Value field = rewriter.create( - loc, fieldTy, bindingsCompName, typeDescRecTy, mlir::ValueRange{}); + mlir::Value field = + fir::FieldIndexOp::create(rewriter, loc, fieldTy, bindingsCompName, + typeDescRecTy, mlir::ValueRange{}); mlir::Type coorTy = fir::ReferenceType::get(typeDescRecTy.getType(bindingsCompName)); mlir::Value bindingBoxAddr = - rewriter.create(loc, coorTy, boxDesc, field); - mlir::Value bindingBox = rewriter.create(loc, bindingBoxAddr); + fir::CoordinateOp::create(rewriter, loc, coorTy, boxDesc, field); + mlir::Value bindingBox = fir::LoadOp::create(rewriter, loc, bindingBoxAddr); // Load the correct binding. - mlir::Value bindings = rewriter.create(loc, bindingBox); + mlir::Value bindings = fir::BoxAddrOp::create(rewriter, loc, bindingBox); fir::RecordType bindingTy = fir::unwrapIfDerived( mlir::cast(bindingBox.getType())); mlir::Type bindingAddrTy = fir::ReferenceType::get(bindingTy); - mlir::Value bindingIdxVal = rewriter.create( - loc, rewriter.getIndexType(), rewriter.getIndexAttr(bindingIdx)); - mlir::Value bindingAddr = rewriter.create( - loc, bindingAddrTy, bindings, bindingIdxVal); + mlir::Value bindingIdxVal = + mlir::arith::ConstantOp::create(rewriter, loc, rewriter.getIndexType(), + rewriter.getIndexAttr(bindingIdx)); + mlir::Value bindingAddr = fir::CoordinateOp::create( + rewriter, loc, bindingAddrTy, bindings, bindingIdxVal); // Get the function pointer. auto procCompName = Fortran::semantics::procCompName; - mlir::Value procField = rewriter.create( - loc, fieldTy, procCompName, bindingTy, mlir::ValueRange{}); + mlir::Value procField = fir::FieldIndexOp::create( + rewriter, loc, fieldTy, procCompName, bindingTy, mlir::ValueRange{}); fir::RecordType procTy = mlir::cast(bindingTy.getType(procCompName)); mlir::Type procRefTy = fir::ReferenceType::get(procTy); - mlir::Value procRef = rewriter.create( - loc, procRefTy, bindingAddr, procField); + mlir::Value procRef = fir::CoordinateOp::create(rewriter, loc, procRefTy, + bindingAddr, procField); auto addressFieldName = Fortran::lower::builtin::cptrFieldName; - mlir::Value addressField = rewriter.create( - loc, fieldTy, addressFieldName, procTy, mlir::ValueRange{}); + mlir::Value addressField = fir::FieldIndexOp::create( + rewriter, loc, fieldTy, addressFieldName, procTy, mlir::ValueRange{}); mlir::Type addressTy = procTy.getType(addressFieldName); mlir::Type addressRefTy = fir::ReferenceType::get(addressTy); - mlir::Value addressRef = rewriter.create( - loc, addressRefTy, procRef, addressField); - mlir::Value address = rewriter.create(loc, addressRef); + mlir::Value addressRef = fir::CoordinateOp::create( + rewriter, loc, addressRefTy, procRef, addressField); + mlir::Value address = fir::LoadOp::create(rewriter, loc, addressRef); // Get the function type. llvm::SmallVector argTypes; @@ -237,7 +239,7 @@ struct DispatchOpConv : public OpConversionPattern { mlir::Type funTy = mlir::FunctionType::get(rewriter.getContext(), argTypes, resTypes); - mlir::Value funcPtr = rewriter.create(loc, funTy, address); + mlir::Value funcPtr = fir::ConvertOp::create(rewriter, loc, funTy, address); // Make the call. 
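Conceptually, the dispatch lowering above is a vtable lookup: the binding index is fixed at compile time, and only the type descriptor reached through the passed object varies at run time. A rough model with hypothetical struct layouts (the real code walks the runtime's descriptors with fir.coordinate_of):

#include <cstddef>

using ProcPtr = void (*)(); // converted to the real procedure type before the call

struct Binding { ProcPtr proc; };
struct TypeDescriptor { const Binding *bindings; std::size_t numBindings; };

static ProcPtr resolveBinding(const TypeDescriptor &td, std::size_t bindingIdx) {
  return td.bindings[bindingIdx].proc; // bindingIdx is fixed at compile time
}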
llvm::SmallVector args{funcPtr}; @@ -398,12 +400,13 @@ llvm::LogicalResult SelectTypeConv::genTypeLadderStep( if (code == 0) return mlir::emitError(loc) << "type code unavailable for " << a.getType(); - mlir::Value typeCode = rewriter.create( - loc, rewriter.getI8IntegerAttr(code)); - mlir::Value selectorTypeCode = rewriter.create( - loc, rewriter.getI8Type(), selector); - cmp = rewriter.create( - loc, mlir::arith::CmpIPredicate::eq, selectorTypeCode, typeCode); + mlir::Value typeCode = mlir::arith::ConstantOp::create( + rewriter, loc, rewriter.getI8IntegerAttr(code)); + mlir::Value selectorTypeCode = fir::BoxTypeCodeOp::create( + rewriter, loc, rewriter.getI8Type(), selector); + cmp = mlir::arith::CmpIOp::create(rewriter, loc, + mlir::arith::CmpIPredicate::eq, + selectorTypeCode, typeCode); } else { // Flang inline the kind parameter in the type descriptor so we can // directly check if the type descriptor addresses are identical for @@ -418,16 +421,16 @@ llvm::LogicalResult SelectTypeConv::genTypeLadderStep( } else if (auto a = mlir::dyn_cast(attr)) { // Retrieve the type descriptor from the type guard statement record type. assert(mlir::isa(a.getType()) && "expect fir.record type"); - mlir::Value typeDescAddr = - rewriter.create(loc, mlir::TypeAttr::get(a.getType())); + mlir::Value typeDescAddr = fir::TypeDescOp::create( + rewriter, loc, mlir::TypeAttr::get(a.getType())); mlir::Type refNoneType = ReferenceType::get(rewriter.getNoneType()); mlir::Value typeDesc = - rewriter.create(loc, refNoneType, typeDescAddr); + ConvertOp::create(rewriter, loc, refNoneType, typeDescAddr); // Prepare the selector descriptor for the runtime call. mlir::Type descNoneTy = fir::BoxType::get(rewriter.getNoneType()); mlir::Value descSelector = - rewriter.create(loc, descNoneTy, selector); + ConvertOp::create(rewriter, loc, descNoneTy, selector); // Generate runtime call. 
llvm::StringRef fctName = RTNAME_STRING(ClassIs); @@ -455,10 +458,10 @@ llvm::LogicalResult SelectTypeConv::genTypeLadderStep( rewriter.createBlock(dest->getParent(), mlir::Region::iterator(dest)); rewriter.setInsertionPointToEnd(thisBlock); if (destOps.has_value()) - rewriter.create(loc, cmp, dest, destOps.value(), - newBlock, std::nullopt); + mlir::cf::CondBranchOp::create(rewriter, loc, cmp, dest, destOps.value(), + newBlock, mlir::ValueRange{}); else - rewriter.create(loc, cmp, dest, newBlock); + mlir::cf::CondBranchOp::create(rewriter, loc, cmp, dest, newBlock); rewriter.setInsertionPointToEnd(newBlock); return mlir::success(); } @@ -470,16 +473,17 @@ SelectTypeConv::genTypeDescCompare(mlir::Location loc, mlir::Value selector, mlir::PatternRewriter &rewriter) const { assert(mlir::isa(ty) && "expect fir.record type"); mlir::Value typeDescAddr = - rewriter.create(loc, mlir::TypeAttr::get(ty)); - mlir::Value selectorTdescAddr = rewriter.create( - loc, typeDescAddr.getType(), selector); + fir::TypeDescOp::create(rewriter, loc, mlir::TypeAttr::get(ty)); + mlir::Value selectorTdescAddr = fir::BoxTypeDescOp::create( + rewriter, loc, typeDescAddr.getType(), selector); auto intPtrTy = rewriter.getIndexType(); auto typeDescInt = - rewriter.create(loc, intPtrTy, typeDescAddr); + fir::ConvertOp::create(rewriter, loc, intPtrTy, typeDescAddr); auto selectorTdescInt = - rewriter.create(loc, intPtrTy, selectorTdescAddr); - return rewriter.create( - loc, mlir::arith::CmpIPredicate::eq, typeDescInt, selectorTdescInt); + fir::ConvertOp::create(rewriter, loc, intPtrTy, selectorTdescAddr); + return mlir::arith::CmpIOp::create(rewriter, loc, + mlir::arith::CmpIPredicate::eq, + typeDescInt, selectorTdescInt); } llvm::SmallSet diff --git a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp index 506c8e66dbdfa..c6aec96ceb5ae 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp @@ -88,18 +88,18 @@ mlir::LogicalResult IsContiguousBoxCoversion::matchAndRewrite( // The scalar cases are supposed to be optimized by the canonicalization. if (rank == 1 || (op.getInnermost() && rank > 0)) { mlir::Type idxTy = builder.getIndexType(); - auto eleSize = builder.create(loc, idxTy, box); + auto eleSize = fir::BoxEleSizeOp::create(builder, loc, idxTy, box); mlir::Value zero = fir::factory::createZeroValue(builder, loc, idxTy); auto dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, box, zero); + fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, box, zero); mlir::Value stride = dimInfo.getByteStride(); - mlir::Value pred1 = builder.create( - loc, mlir::arith::CmpIPredicate::eq, eleSize, stride); + mlir::Value pred1 = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, eleSize, stride); mlir::Value extent = dimInfo.getExtent(); - mlir::Value pred2 = builder.create( - loc, mlir::arith::CmpIPredicate::eq, extent, zero); + mlir::Value pred2 = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, extent, zero); mlir::Value result = - builder.create(loc, pred1, pred2); + mlir::arith::OrIOp::create(builder, loc, pred1, pred2); result = builder.createConvert(loc, op.getType(), result); rewriter.replaceOp(op, result); return mlir::success(); @@ -192,7 +192,7 @@ class DoConcurrentConversion // TODO Should this be a heap allocation instead? For now, we allocate // on the stack for each loop iteration. 
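The is_contiguous lowering above reduces to a simple descriptor test for the rank-1 / innermost case. As a host-side sketch (hypothetical helper; the pass emits fir.box_elesize, fir.box_dims and the arith compares shown in the hunk):

#include <cstdint>

static bool isContiguousInnermost(int64_t elemSizeBytes, int64_t byteStride,
                                  int64_t extent) {
  // Contiguous when the innermost byte stride equals the element size, or
  // when the dimension is empty so the stride does not matter.
  return byteStride == elemSizeBytes || extent == 0;
}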
mlir::Value localAlloc = - rewriter.create(loop.getLoc(), localizer.getType()); + fir::AllocaOp::create(rewriter, loop.getLoc(), localizer.getType()); auto cloneLocalizerRegion = [&](mlir::Region ®ion, mlir::ValueRange regionArgs, @@ -258,10 +258,10 @@ class DoConcurrentConversion for (auto [lb, ub, st, iv] : llvm::zip_equal(loop.getLowerBound(), loop.getUpperBound(), loop.getStep(), *loop.getLoopInductionVars())) { - innermostUnorderdLoop = rewriter.create( - doConcurentOp.getLoc(), lb, ub, st, + innermostUnorderdLoop = fir::DoLoopOp::create( + rewriter, doConcurentOp.getLoc(), lb, ub, st, /*unordred=*/true, /*finalCountValue=*/false, - /*iterArgs=*/std::nullopt, loop.getReduceVars(), + /*iterArgs=*/mlir::ValueRange{}, loop.getReduceVars(), loop.getReduceAttrsAttr()); ivArgs.push_back(innermostUnorderdLoop.getInductionVar()); rewriter.setInsertionPointToStart(innermostUnorderdLoop.getBody()); diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index 4d25a02bf18ba..49a085ee3b336 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -284,7 +284,7 @@ genReductionLoop(fir::FirOpBuilder &builder, mlir::func::FuncOp &funcOp, fir::SequenceType::getUnknownExtent()); mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType); mlir::Type boxArrTy = fir::BoxType::get(arrTy); - mlir::Value array = builder.create(loc, boxArrTy, arg); + mlir::Value array = fir::ConvertOp::create(builder, loc, boxArrTy, arg); mlir::Type resultType = funcOp.getResultTypes()[0]; mlir::Value init = initVal(builder, loc, resultType); @@ -299,11 +299,11 @@ genReductionLoop(fir::FirOpBuilder &builder, mlir::func::FuncOp &funcOp, // should be able to optimize the redundancy. for (unsigned i = 0; i < rank; ++i) { mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); - auto dims = - builder.create(loc, idxTy, idxTy, idxTy, array, dimIdx); + auto dims = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, array, + dimIdx); mlir::Value len = dims.getResult(1); // We use C indexing here, so len-1 as loopcount - mlir::Value loopCount = builder.create(loc, len, one); + mlir::Value loopCount = mlir::arith::SubIOp::create(builder, loc, len, one); bounds.push_back(loopCount); } // Create a loop nest consisting of OP operations. @@ -316,9 +316,9 @@ genReductionLoop(fir::FirOpBuilder &builder, mlir::func::FuncOp &funcOp, for (unsigned i = rank; 0 < i; --i) { mlir::Value step = one; mlir::Value loopCount = bounds[i - 1]; - auto loop = builder.create(loc, zeroIdx, loopCount, step, - unorderedOrInitialLoopCond, - /*finalCountValue=*/false, init); + auto loop = OP::create(builder, loc, zeroIdx, loopCount, step, + unorderedOrInitialLoopCond, + /*finalCountValue=*/false, init); init = loop.getRegionIterArgs()[resultIndex]; indices.push_back(loop.getInductionVar()); // Set insertion point to the loop body so that the next loop @@ -332,8 +332,8 @@ genReductionLoop(fir::FirOpBuilder &builder, mlir::func::FuncOp &funcOp, // We are in the innermost loop: generate the reduction body. 
mlir::Type eleRefTy = builder.getRefType(elementType); mlir::Value addr = - builder.create(loc, eleRefTy, array, indices); - mlir::Value elem = builder.create(loc, addr); + fir::CoordinateOp::create(builder, loc, eleRefTy, array, indices); + mlir::Value elem = fir::LoadOp::create(builder, loc, addr); mlir::Value reductionVal = genBody(builder, loc, elementType, elem, init); // Generate vector with condition to continue while loop at [0] and result // from current loop at [1] for IterWhileOp loops, just result at [0] for @@ -344,7 +344,7 @@ genReductionLoop(fir::FirOpBuilder &builder, mlir::func::FuncOp &funcOp, // to return the updated value of the reduction to the enclosing // loops. for (unsigned i = 0; i < rank; ++i) { - auto result = builder.create(loc, results); + auto result = fir::ResultOp::create(builder, loc, results); // Proceed to the outer loop. auto loop = mlir::cast(result->getParentOp()); results = loop.getResults(); @@ -354,7 +354,7 @@ genReductionLoop(fir::FirOpBuilder &builder, mlir::func::FuncOp &funcOp, } // End of loop nest. The insertion point is after the outermost loop. // Return the reduction value from the function. - builder.create(loc, results[resultIndex]); + mlir::func::ReturnOp::create(builder, loc, results[resultIndex]); } static llvm::SmallVector nopLoopCond(fir::FirOpBuilder &builder, @@ -394,9 +394,9 @@ static void genRuntimeSumBody(fir::FirOpBuilder &builder, mlir::Type elementType, mlir::Value elem1, mlir::Value elem2) -> mlir::Value { if (mlir::isa(elementType)) - return builder.create(loc, elem1, elem2); + return mlir::arith::AddFOp::create(builder, loc, elem1, elem2); if (mlir::isa(elementType)) - return builder.create(loc, elem1, elem2); + return mlir::arith::AddIOp::create(builder, loc, elem1, elem2); llvm_unreachable("unsupported type"); return {}; @@ -436,12 +436,12 @@ static void genRuntimeMaxvalBody(fir::FirOpBuilder &builder, // This libm function may not work properly for F128 arguments // on targets where long double is not F128. It is an LLVM issue, // but we just use normal select here to resolve all the cases. 
- auto compare = builder.create( - loc, mlir::arith::CmpFPredicate::OGT, elem1, elem2); - return builder.create(loc, compare, elem1, elem2); + auto compare = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OGT, elem1, elem2); + return mlir::arith::SelectOp::create(builder, loc, compare, elem1, elem2); } if (mlir::isa(elementType)) - return builder.create(loc, elem1, elem2); + return mlir::arith::MaxSIOp::create(builder, loc, elem1, elem2); llvm_unreachable("unsupported type"); return {}; @@ -472,11 +472,11 @@ static void genRuntimeCountBody(fir::FirOpBuilder &builder, auto zero64 = builder.createIntegerConstant(loc, builder.getI64Type(), 0); auto one64 = builder.createIntegerConstant(loc, builder.getI64Type(), 1); - auto compare = builder.create( - loc, mlir::arith::CmpIPredicate::eq, elem1, zero32); + auto compare = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, elem1, zero32); auto select = - builder.create(loc, compare, zero64, one64); - return builder.create(loc, select, elem2); + mlir::arith::SelectOp::create(builder, loc, compare, zero64, one64); + return mlir::arith::AddIOp::create(builder, loc, select, elem2); }; // Count always gets I32 for elementType as it converts logical input to @@ -501,14 +501,14 @@ static void genRuntimeAnyBody(fir::FirOpBuilder &builder, mlir::Type elementType, mlir::Value elem1, mlir::Value elem2) -> mlir::Value { auto zero = builder.createIntegerConstant(loc, elementType, 0); - return builder.create( - loc, mlir::arith::CmpIPredicate::ne, elem1, zero); + return mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, elem1, zero); }; auto continueCond = [](fir::FirOpBuilder builder, mlir::Location loc, mlir::Value reductionVal) { auto one1 = builder.createIntegerConstant(loc, builder.getI1Type(), 1); - auto eor = builder.create(loc, reductionVal, one1); + auto eor = mlir::arith::XOrIOp::create(builder, loc, reductionVal, one1); llvm::SmallVector results = {eor, reductionVal}; return results; }; @@ -534,8 +534,8 @@ static void genRuntimeAllBody(fir::FirOpBuilder &builder, mlir::Type elementType, mlir::Value elem1, mlir::Value elem2) -> mlir::Value { auto zero = builder.createIntegerConstant(loc, elementType, 0); - return builder.create( - loc, mlir::arith::CmpIPredicate::ne, elem1, zero); + return mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, elem1, zero); }; auto continueCond = [](fir::FirOpBuilder builder, mlir::Location loc, @@ -577,13 +577,13 @@ void fir::genMinMaxlocReductionLoop( fir::SequenceType::getUnknownExtent()); mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType); mlir::Type boxArrTy = fir::BoxType::get(arrTy); - array = builder.create(loc, boxArrTy, array); + array = fir::ConvertOp::create(builder, loc, boxArrTy, array); mlir::Type resultElemType = hlfir::getFortranElementType(resultArr.getType()); mlir::Value flagSet = builder.createIntegerConstant(loc, resultElemType, 1); mlir::Value zero = builder.createIntegerConstant(loc, resultElemType, 0); mlir::Value flagRef = builder.createTemporary(loc, resultElemType); - builder.create(loc, zero, flagRef); + fir::StoreOp::create(builder, loc, zero, flagRef); mlir::Value init = initVal(builder, loc, elementType); llvm::SmallVector bounds; @@ -597,11 +597,11 @@ void fir::genMinMaxlocReductionLoop( // should be able to optimize the redundancy. 
for (unsigned i = 0; i < rank; ++i) { mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); - auto dims = - builder.create(loc, idxTy, idxTy, idxTy, array, dimIdx); + auto dims = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, array, + dimIdx); mlir::Value len = dims.getResult(1); // We use C indexing here, so len-1 as loopcount - mlir::Value loopCount = builder.create(loc, len, one); + mlir::Value loopCount = mlir::arith::SubIOp::create(builder, loc, len, one); bounds.push_back(loopCount); } // Create a loop nest consisting of OP operations. @@ -615,8 +615,8 @@ void fir::genMinMaxlocReductionLoop( mlir::Value step = one; mlir::Value loopCount = bounds[i - 1]; auto loop = - builder.create(loc, zeroIdx, loopCount, step, false, - /*finalCountValue=*/false, init); + fir::DoLoopOp::create(builder, loc, zeroIdx, loopCount, step, false, + /*finalCountValue=*/false, init); init = loop.getRegionIterArgs()[0]; indices.push_back(loop.getInductionVar()); // Set insertion point to the loop body so that the next loop @@ -634,7 +634,7 @@ void fir::genMinMaxlocReductionLoop( // to return the updated value of the reduction to the enclosing // loops. for (unsigned i = 0; i < rank; ++i) { - auto result = builder.create(loc, reductionVal); + auto result = fir::ResultOp::create(builder, loc, reductionVal); // Proceed to the outer loop. auto loop = mlir::cast(result->getParentOp()); reductionVal = loop.getResult(0); @@ -646,7 +646,7 @@ void fir::genMinMaxlocReductionLoop( if (maskMayBeLogicalScalar) { if (fir::IfOp ifOp = mlir::dyn_cast(builder.getBlock()->getParentOp())) { - builder.create(loc, reductionVal); + fir::ResultOp::create(builder, loc, reductionVal); builder.setInsertionPointAfter(ifOp); // Redefine flagSet to escape scope of ifOp flagSet = builder.createIntegerConstant(loc, resultElemType, 1); @@ -689,10 +689,11 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, mlir::Value returnValue = builder.createIntegerConstant(loc, resultElemTy, 0); mlir::Value resultArrSize = builder.createIntegerConstant(loc, idxTy, rank); - mlir::Value resultArrInit = builder.create(loc, resultTy); - mlir::Value resultArrShape = builder.create(loc, resultArrSize); - mlir::Value resultArr = builder.create( - loc, resultBoxTy, resultArrInit, resultArrShape); + mlir::Value resultArrInit = fir::AllocMemOp::create(builder, loc, resultTy); + mlir::Value resultArrShape = + fir::ShapeOp::create(builder, loc, resultArrSize); + mlir::Value resultArr = fir::EmboxOp::create(builder, loc, resultBoxTy, + resultArrInit, resultArrShape); mlir::Type resultRefTy = builder.getRefType(resultElemTy); @@ -701,14 +702,14 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, fir::SequenceType::getUnknownExtent()); mlir::Type maskTy = fir::SequenceType::get(flatShape, maskElemType); mlir::Type boxMaskTy = fir::BoxType::get(maskTy); - mask = builder.create(loc, boxMaskTy, mask); + mask = fir::ConvertOp::create(builder, loc, boxMaskTy, mask); } for (unsigned int i = 0; i < rank; ++i) { mlir::Value index = builder.createIntegerConstant(loc, idxTy, i); mlir::Value resultElemAddr = - builder.create(loc, resultRefTy, resultArr, index); - builder.create(loc, returnValue, resultElemAddr); + fir::CoordinateOp::create(builder, loc, resultRefTy, resultArr, index); + fir::StoreOp::create(builder, loc, returnValue, resultElemAddr); } auto genBodyOp = @@ -720,29 +721,30 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, if (maskRank > 0) { mlir::Type logicalRef = 
builder.getRefType(maskElemType); mlir::Value maskAddr = - builder.create(loc, logicalRef, mask, indices); - mlir::Value maskElem = builder.create(loc, maskAddr); + fir::CoordinateOp::create(builder, loc, logicalRef, mask, indices); + mlir::Value maskElem = fir::LoadOp::create(builder, loc, maskAddr); // fir::IfOp requires argument to be I1 - won't accept logical or any // other Integer. mlir::Type ifCompatType = builder.getI1Type(); mlir::Value ifCompatElem = - builder.create(loc, ifCompatType, maskElem); + fir::ConvertOp::create(builder, loc, ifCompatType, maskElem); llvm::SmallVector resultsTy = {elementType, elementType}; - fir::IfOp ifOp = builder.create(loc, elementType, ifCompatElem, - /*withElseRegion=*/true); + fir::IfOp ifOp = + fir::IfOp::create(builder, loc, elementType, ifCompatElem, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); } // Set flag that mask was true at some point mlir::Value flagSet = builder.createIntegerConstant( loc, mlir::cast(flagRef.getType()).getEleTy(), 1); - mlir::Value isFirst = builder.create(loc, flagRef); + mlir::Value isFirst = fir::LoadOp::create(builder, loc, flagRef); mlir::Type eleRefTy = builder.getRefType(elementType); mlir::Value addr = - builder.create(loc, eleRefTy, array, indices); - mlir::Value elem = builder.create(loc, addr); + fir::CoordinateOp::create(builder, loc, eleRefTy, array, indices); + mlir::Value elem = fir::LoadOp::create(builder, loc, addr); mlir::Value cmp; if (mlir::isa(elementType)) { @@ -750,38 +752,37 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, // is not NaN. A OGL/OLT condition will usually work for this unless all // the values are Nan or Inf. This follows the same logic as // NumericCompare for Minloc/Maxlox in extrema.cpp. - cmp = builder.create( - loc, - isMax ? mlir::arith::CmpFPredicate::OGT - : mlir::arith::CmpFPredicate::OLT, - elem, reduction); - - mlir::Value cmpNan = builder.create( - loc, mlir::arith::CmpFPredicate::UNE, reduction, reduction); - mlir::Value cmpNan2 = builder.create( - loc, mlir::arith::CmpFPredicate::OEQ, elem, elem); - cmpNan = builder.create(loc, cmpNan, cmpNan2); - cmp = builder.create(loc, cmp, cmpNan); + cmp = mlir::arith::CmpFOp::create(builder, loc, + isMax ? mlir::arith::CmpFPredicate::OGT + : mlir::arith::CmpFPredicate::OLT, + elem, reduction); + + mlir::Value cmpNan = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::UNE, reduction, reduction); + mlir::Value cmpNan2 = mlir::arith::CmpFOp::create( + builder, loc, mlir::arith::CmpFPredicate::OEQ, elem, elem); + cmpNan = mlir::arith::AndIOp::create(builder, loc, cmpNan, cmpNan2); + cmp = mlir::arith::OrIOp::create(builder, loc, cmp, cmpNan); } else if (mlir::isa(elementType)) { - cmp = builder.create( - loc, - isMax ? mlir::arith::CmpIPredicate::sgt - : mlir::arith::CmpIPredicate::slt, - elem, reduction); + cmp = mlir::arith::CmpIOp::create(builder, loc, + isMax ? mlir::arith::CmpIPredicate::sgt + : mlir::arith::CmpIPredicate::slt, + elem, reduction); } else { llvm_unreachable("unsupported type"); } // The condition used for the loop is isFirst || . 
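The comparison assembled above decides whether the current element becomes the new minloc/maxloc candidate; the extra NaN terms make the reduction pick the first non-NaN value even when earlier elements were NaN. A host-side sketch for the floating-point case (hypothetical helper; noCandidateYet corresponds to the negated flag loaded from flagRef):

#include <cmath>

static bool takeNewElement(double elem, double reduction, bool noCandidateYet,
                           bool isMax) {
  bool better = isMax ? (elem > reduction)  // arith.cmpf OGT
                      : (elem < reduction); // arith.cmpf OLT
  bool replaceNan = std::isnan(reduction) && !std::isnan(elem);
  return better || replaceNan || noCandidateYet;
}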
- isFirst = builder.create(loc, cmp.getType(), isFirst); - isFirst = builder.create( - loc, isFirst, builder.createIntegerConstant(loc, cmp.getType(), 1)); - cmp = builder.create(loc, cmp, isFirst); - fir::IfOp ifOp = builder.create(loc, elementType, cmp, - /*withElseRegion*/ true); + isFirst = fir::ConvertOp::create(builder, loc, cmp.getType(), isFirst); + isFirst = mlir::arith::XOrIOp::create( + builder, loc, isFirst, + builder.createIntegerConstant(loc, cmp.getType(), 1)); + cmp = mlir::arith::OrIOp::create(builder, loc, cmp, isFirst); + fir::IfOp ifOp = fir::IfOp::create(builder, loc, elementType, cmp, + /*withElseRegion*/ true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - builder.create(loc, flagSet, flagRef); + fir::StoreOp::create(builder, loc, flagSet, flagRef); mlir::Type resultElemTy = hlfir::getFortranElementType(resultArr.getType()); mlir::Type returnRefTy = builder.getRefType(resultElemTy); mlir::IndexType idxTy = builder.getIndexType(); @@ -790,17 +791,17 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, for (unsigned int i = 0; i < rank; ++i) { mlir::Value index = builder.createIntegerConstant(loc, idxTy, i); - mlir::Value resultElemAddr = - builder.create(loc, returnRefTy, resultArr, index); + mlir::Value resultElemAddr = fir::CoordinateOp::create( + builder, loc, returnRefTy, resultArr, index); mlir::Value convert = - builder.create(loc, resultElemTy, indices[i]); + fir::ConvertOp::create(builder, loc, resultElemTy, indices[i]); mlir::Value fortranIndex = - builder.create(loc, convert, one); - builder.create(loc, fortranIndex, resultElemAddr); + mlir::arith::AddIOp::create(builder, loc, convert, one); + fir::StoreOp::create(builder, loc, fortranIndex, resultElemAddr); } - builder.create(loc, elem); + fir::ResultOp::create(builder, loc, elem); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - builder.create(loc, reduction); + fir::ResultOp::create(builder, loc, reduction); builder.setInsertionPointAfter(ifOp); mlir::Value reductionVal = ifOp.getResult(0); @@ -808,9 +809,9 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, if (maskRank > 0) { fir::IfOp ifOp = mlir::dyn_cast(builder.getBlock()->getParentOp()); - builder.create(loc, reductionVal); + fir::ResultOp::create(builder, loc, reductionVal); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - builder.create(loc, reduction); + fir::ResultOp::create(builder, loc, reduction); reductionVal = ifOp.getResult(0); builder.setInsertionPointAfter(ifOp); } @@ -825,12 +826,12 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, mlir::Type logical = maskElemType; mlir::Type logicalRefTy = builder.getRefType(logical); mlir::Value condAddr = - builder.create(loc, logicalRefTy, mask); - mlir::Value cond = builder.create(loc, condAddr); - mlir::Value condI1 = builder.create(loc, i1Type, cond); + fir::BoxAddrOp::create(builder, loc, logicalRefTy, mask); + mlir::Value cond = fir::LoadOp::create(builder, loc, condAddr); + mlir::Value condI1 = fir::ConvertOp::create(builder, loc, i1Type, cond); - fir::IfOp ifOp = builder.create(loc, elementType, condI1, - /*withElseRegion=*/true); + fir::IfOp ifOp = fir::IfOp::create(builder, loc, elementType, condI1, + /*withElseRegion=*/true); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); mlir::Value basicValue; @@ -839,7 +840,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, } else { basicValue = builder.createRealConstant(loc, elementType, 0); } - 
builder.create(loc, basicValue); + fir::ResultOp::create(builder, loc, basicValue); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); } @@ -847,8 +848,8 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, const mlir::Type &resultElemType, mlir::Value resultArr, mlir::Value index) { mlir::Type resultRefTy = builder.getRefType(resultElemType); - return builder.create(loc, resultRefTy, resultArr, - index); + return fir::CoordinateOp::create(builder, loc, resultRefTy, resultArr, + index); }; genMinMaxlocReductionLoop(builder, funcOp.front().getArgument(1), init, @@ -859,25 +860,26 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder, if (isDim) { mlir::Type resultBoxTy = fir::BoxType::get(fir::HeapType::get(resultElemTy)); - mlir::Value outputArr = builder.create( - loc, builder.getRefType(resultBoxTy), funcOp.front().getArgument(0)); - mlir::Value resultArrScalar = builder.create( - loc, fir::HeapType::get(resultElemTy), resultArrInit); + mlir::Value outputArr = + fir::ConvertOp::create(builder, loc, builder.getRefType(resultBoxTy), + funcOp.front().getArgument(0)); + mlir::Value resultArrScalar = fir::ConvertOp::create( + builder, loc, fir::HeapType::get(resultElemTy), resultArrInit); mlir::Value resultBox = - builder.create(loc, resultBoxTy, resultArrScalar); - builder.create(loc, resultBox, outputArr); + fir::EmboxOp::create(builder, loc, resultBoxTy, resultArrScalar); + fir::StoreOp::create(builder, loc, resultBox, outputArr); } else { fir::SequenceType::Shape resultShape(1, rank); mlir::Type outputArrTy = fir::SequenceType::get(resultShape, resultElemTy); mlir::Type outputHeapTy = fir::HeapType::get(outputArrTy); mlir::Type outputBoxTy = fir::BoxType::get(outputHeapTy); mlir::Type outputRefTy = builder.getRefType(outputBoxTy); - mlir::Value outputArr = builder.create( - loc, outputRefTy, funcOp.front().getArgument(0)); - builder.create(loc, resultArr, outputArr); + mlir::Value outputArr = fir::ConvertOp::create( + builder, loc, outputRefTy, funcOp.front().getArgument(0)); + fir::StoreOp::create(builder, loc, resultArr, outputArr); } - builder.create(loc); + mlir::func::ReturnOp::create(builder, loc); } /// Generate function type for the simplified version of RTNAME(DotProduct) @@ -929,10 +931,10 @@ static void genRuntimeDotBody(fir::FirOpBuilder &builder, fir::SequenceType::Shape flatShape = {fir::SequenceType::getUnknownExtent()}; mlir::Type arrTy1 = fir::SequenceType::get(flatShape, arg1ElementTy); mlir::Type boxArrTy1 = fir::BoxType::get(arrTy1); - mlir::Value array1 = builder.create(loc, boxArrTy1, arg1); + mlir::Value array1 = fir::ConvertOp::create(builder, loc, boxArrTy1, arg1); mlir::Type arrTy2 = fir::SequenceType::get(flatShape, arg2ElementTy); mlir::Type boxArrTy2 = fir::BoxType::get(arrTy2); - mlir::Value array2 = builder.create(loc, boxArrTy2, arg2); + mlir::Value array2 = fir::ConvertOp::create(builder, loc, boxArrTy2, arg2); // This version takes the loop trip count from the first argument. // If the first argument's box has unknown (at compilation time) // extent, then it may be better to take the extent from the second @@ -941,17 +943,17 @@ static void genRuntimeDotBody(fir::FirOpBuilder &builder, // function and some analysis at the call site to choose which version // is more profitable to call. // Note that we can assume that both arguments have the same extent. 
- auto dims = - builder.create(loc, idxTy, idxTy, idxTy, array1, zeroIdx); + auto dims = fir::BoxDimsOp::create(builder, loc, idxTy, idxTy, idxTy, array1, + zeroIdx); mlir::Value len = dims.getResult(1); mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); mlir::Value step = one; // We use C indexing here, so len-1 as loopcount - mlir::Value loopCount = builder.create(loc, len, one); - auto loop = builder.create(loc, zeroIdx, loopCount, step, - /*unordered=*/false, - /*finalCountValue=*/false, zero); + mlir::Value loopCount = mlir::arith::SubIOp::create(builder, loc, len, one); + auto loop = fir::DoLoopOp::create(builder, loc, zeroIdx, loopCount, step, + /*unordered=*/false, + /*finalCountValue=*/false, zero); mlir::Value sumVal = loop.getRegionIterArgs()[0]; // Begin loop code @@ -961,33 +963,35 @@ static void genRuntimeDotBody(fir::FirOpBuilder &builder, mlir::Type eleRef1Ty = builder.getRefType(arg1ElementTy); mlir::Value index = loop.getInductionVar(); mlir::Value addr1 = - builder.create(loc, eleRef1Ty, array1, index); - mlir::Value elem1 = builder.create(loc, addr1); + fir::CoordinateOp::create(builder, loc, eleRef1Ty, array1, index); + mlir::Value elem1 = fir::LoadOp::create(builder, loc, addr1); // Convert to the result type. - elem1 = builder.create(loc, resultElementType, elem1); + elem1 = fir::ConvertOp::create(builder, loc, resultElementType, elem1); mlir::Type eleRef2Ty = builder.getRefType(arg2ElementTy); mlir::Value addr2 = - builder.create(loc, eleRef2Ty, array2, index); - mlir::Value elem2 = builder.create(loc, addr2); + fir::CoordinateOp::create(builder, loc, eleRef2Ty, array2, index); + mlir::Value elem2 = fir::LoadOp::create(builder, loc, addr2); // Convert to the result type. - elem2 = builder.create(loc, resultElementType, elem2); + elem2 = fir::ConvertOp::create(builder, loc, resultElementType, elem2); if (mlir::isa(resultElementType)) - sumVal = builder.create( - loc, builder.create(loc, elem1, elem2), sumVal); + sumVal = mlir::arith::AddFOp::create( + builder, loc, mlir::arith::MulFOp::create(builder, loc, elem1, elem2), + sumVal); else if (mlir::isa(resultElementType)) - sumVal = builder.create( - loc, builder.create(loc, elem1, elem2), sumVal); + sumVal = mlir::arith::AddIOp::create( + builder, loc, mlir::arith::MulIOp::create(builder, loc, elem1, elem2), + sumVal); else llvm_unreachable("unsupported type"); - builder.create(loc, sumVal); + fir::ResultOp::create(builder, loc, sumVal); // End of loop. 
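The generated DotProduct wrapper above amounts to a single accumulation loop whose trip count is taken from the first argument's descriptor. As a plain-C++ sketch (hypothetical template; the real generated function loads and converts elements through FIR ops):

#include <cstddef>

template <typename T>
static T dotProduct(const T *a, const T *b, std::size_t len) {
  T sum = T(0);
  for (std::size_t i = 0; i < len; ++i)
    sum += a[i] * b[i];
  return sum;
}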
builder.restoreInsertionPoint(loopEndPt); mlir::Value resultVal = loop.getResult(0); - builder.create(loc, resultVal); + mlir::func::ReturnOp::create(builder, loc, resultVal); } mlir::func::FuncOp SimplifyIntrinsicsPass::getOrCreateFunction( @@ -1229,8 +1233,8 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( mlir::func::FuncOp newFunc = getOrCreateFunction(builder, funcName, typeGenerator, bodyGenerator); - builder.create(loc, newFunc, - mlir::ValueRange{args[0], args[1], mask}); + fir::CallOp::create(builder, loc, newFunc, + mlir::ValueRange{args[0], args[1], mask}); call->dropAllReferences(); call->erase(); } @@ -1259,7 +1263,7 @@ void SimplifyIntrinsicsPass::simplifyReductionBody( mlir::func::FuncOp newFunc = getOrCreateFunction(builder, funcName, typeGenerator, bodyGenerator); auto newCall = - builder.create(loc, newFunc, mlir::ValueRange{args[0]}); + fir::CallOp::create(builder, loc, newFunc, mlir::ValueRange{args[0]}); call->replaceAllUsesWith(newCall.getResults()); call->dropAllReferences(); call->erase(); @@ -1344,8 +1348,8 @@ void SimplifyIntrinsicsPass::runOnOperation() { mlir::func::FuncOp newFunc = getOrCreateFunction( builder, typedFuncName, typeGenerator, bodyGenerator); - auto newCall = builder.create(loc, newFunc, - mlir::ValueRange{v1, v2}); + auto newCall = fir::CallOp::create(builder, loc, newFunc, + mlir::ValueRange{v1, v2}); call->replaceAllUsesWith(newCall.getResults()); call->dropAllReferences(); call->erase(); diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp index bc8a9497fbb70..0d131291feef3 100644 --- a/flang/lib/Optimizer/Transforms/StackArrays.cpp +++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -569,7 +569,7 @@ static mlir::Value convertAllocationType(mlir::PatternRewriter &rewriter, auto insertionPoint = rewriter.saveInsertionPoint(); rewriter.setInsertionPointAfter(stack.getDefiningOp()); mlir::Value conv = - rewriter.create(loc, firHeapTy, stack).getResult(); + fir::ConvertOp::create(rewriter, loc, firHeapTy, stack).getResult(); rewriter.restoreInsertionPoint(insertionPoint); return conv; } @@ -758,9 +758,9 @@ AllocMemConversion::insertAlloca(fir::AllocMemOp &oldAlloc, llvm::StringRef uniqName = unpackName(oldAlloc.getUniqName()); llvm::StringRef bindcName = unpackName(oldAlloc.getBindcName()); - auto alloca = rewriter.create(loc, varTy, uniqName, bindcName, - oldAlloc.getTypeparams(), - oldAlloc.getShape()); + auto alloca = + fir::AllocaOp::create(rewriter, loc, varTy, uniqName, bindcName, + oldAlloc.getTypeparams(), oldAlloc.getShape()); if (emitLifetimeMarkers) insertLifetimeMarkers(oldAlloc, alloca, rewriter); diff --git a/flang/lib/Parser/message.cpp b/flang/lib/Parser/message.cpp index 909fba948a45a..2a8101dd0b810 100644 --- a/flang/lib/Parser/message.cpp +++ b/flang/lib/Parser/message.cpp @@ -453,7 +453,7 @@ void Messages::ResolveProvenances(const AllCookedSources &allCooked) { void Messages::Emit(llvm::raw_ostream &o, const AllCookedSources &allCooked, bool echoSourceLines, const common::LanguageFeatureControl *hintFlagPtr, - std::size_t maxErrorsToEmit) const { + std::size_t maxErrorsToEmit, bool warningsAreErrors) const { std::vector sorted; for (const auto &msg : messages_) { sorted.push_back(&msg); @@ -469,7 +469,7 @@ void Messages::Emit(llvm::raw_ostream &o, const AllCookedSources &allCooked, } msg->Emit(o, allCooked, echoSourceLines, hintFlagPtr); lastMsg = msg; - if (msg->IsFatal()) { + if (warningsAreErrors || msg->IsFatal()) { ++errorsEmitted; } // If 
maxErrorsToEmit is 0, emit all errors, otherwise break after @@ -491,7 +491,18 @@ void Messages::AttachTo(Message &msg, std::optional severity) { messages_.clear(); } -bool Messages::AnyFatalError() const { +bool Messages::AnyFatalError(bool warningsAreErrors) const { + // Short-circuit in the most common case. + if (messages_.empty()) { + return false; + } + // If warnings are errors and there are warnings or errors, this is fatal. + // This preserves the compiler's current behavior of treating any non-fatal + // message as a warning. We may want to refine this in the future. + if (warningsAreErrors) { + return true; + } + // Otherwise, check the message buffer for fatal errors. for (const auto &msg : messages_) { if (msg.IsFatal()) { return true; diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 76c9499410017..d349d8ceb0bb5 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -274,6 +274,10 @@ TYPE_PARSER( // construct(Parser{}) || construct(Parser{})) +// 2.15.3.6 REDUCTION (reduction-identifier: variable-name-list) +TYPE_PARSER(construct(Parser{}) || + construct(Parser{})) + TYPE_PARSER(construct( // Parser{}, ":"_tok >> nonemptyList(Parser{}), @@ -442,9 +446,18 @@ TYPE_PARSER(construct( TYPE_PARSER(construct(scalarIntExpr)) +TYPE_PARSER(construct( // + "ALWAYS" >> pure(OmpAlwaysModifier::Value::Always))) + TYPE_PARSER(construct( // "SIMD" >> pure(OmpChunkModifier::Value::Simd))) +TYPE_PARSER(construct( // + "CLOSE" >> pure(OmpCloseModifier::Value::Close))) + +TYPE_PARSER(construct( // + "DELETE" >> pure(OmpDeleteModifier::Value::Delete))) + TYPE_PARSER(construct( "SINK" >> pure(OmpDependenceType::Value::Sink) || "SOURCE" >> pure(OmpDependenceType::Value::Source))) @@ -502,26 +515,16 @@ TYPE_PARSER(construct( // TYPE_PARSER(construct( // "MAPPER"_tok >> parenthesized(Parser{}))) -// map-type -> ALLOC | DELETE | FROM | RELEASE | TO | TOFROM +// map-type -> ALLOC | DELETE | FROM | RELEASE | STORAGE | TO | TOFROM TYPE_PARSER(construct( // "ALLOC" >> pure(OmpMapType::Value::Alloc) || - "DELETE" >> pure(OmpMapType::Value::Delete) || + // Parse "DELETE" as OmpDeleteModifier "FROM" >> pure(OmpMapType::Value::From) || "RELEASE" >> pure(OmpMapType::Value::Release) || + "STORAGE" >> pure(OmpMapType::Value::Storage) || "TO"_id >> pure(OmpMapType::Value::To) || "TOFROM" >> pure(OmpMapType::Value::Tofrom))) -// map-type-modifier -> ALWAYS | CLOSE | OMPX_HOLD | PRESENT -TYPE_PARSER(construct( - "ALWAYS" >> pure(OmpMapTypeModifier::Value::Always) || - "CLOSE" >> pure(OmpMapTypeModifier::Value::Close) || - "OMPX_HOLD" >> pure(OmpMapTypeModifier::Value::Ompx_Hold) || - "PRESENT" >> pure(OmpMapTypeModifier::Value::Present))) - -// 2.15.3.6 REDUCTION (reduction-identifier: variable-name-list) -TYPE_PARSER(construct(Parser{}) || - construct(Parser{})) - TYPE_PARSER(construct( "REPRODUCIBLE" >> pure(OmpOrderModifier::Value::Reproducible) || "UNCONSTRAINED" >> pure(OmpOrderModifier::Value::Unconstrained))) @@ -534,11 +537,22 @@ TYPE_PARSER(construct( TYPE_PARSER(construct( "STRICT" >> pure(OmpPrescriptiveness::Value::Strict))) +TYPE_PARSER(construct( // + "PRESENT" >> pure(OmpPresentModifier::Value::Present))) + TYPE_PARSER(construct( "INSCAN" >> pure(OmpReductionModifier::Value::Inscan) || "TASK" >> pure(OmpReductionModifier::Value::Task) || "DEFAULT" >> pure(OmpReductionModifier::Value::Default))) +TYPE_PARSER(construct( // + "REF_PTEE" >> pure(OmpRefModifier::Value::Ref_Ptee) || + "REF_PTR"_id >> 
pure(OmpRefModifier::Value::Ref_Ptr) || + "REF_PTR_PTEE" >> pure(OmpRefModifier::Value::Ref_Ptr_Ptee))) + +TYPE_PARSER(construct( // + "SELF" >> pure(OmpSelfModifier::Value::Self))) + TYPE_PARSER(construct( // "STEP" >> parenthesized(scalarIntExpr))) @@ -559,6 +573,9 @@ TYPE_PARSER(construct( "POINTER" >> pure(OmpVariableCategory::Value::Pointer) || "SCALAR" >> pure(OmpVariableCategory::Value::Scalar))) +TYPE_PARSER(construct( // + "OMPX_HOLD" >> pure(OmpxHoldModifier::Value::Ompx_Hold))) + // This could be auto-generated. TYPE_PARSER( sourced(construct(Parser{}))) @@ -611,10 +628,16 @@ TYPE_PARSER(sourced( construct(Parser{}))) TYPE_PARSER(sourced(construct( - sourced(construct(Parser{}) || + sourced(construct(Parser{}) || + construct(Parser{}) || + construct(Parser{}) || + construct(Parser{}) || + construct(Parser{}) || + construct(Parser{}) || construct(Parser{}) || construct(Parser{}) || - construct(Parser{}))))) + construct(Parser{}) || + construct(Parser{}))))) TYPE_PARSER( sourced(construct(Parser{}))) diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index fbe89c668fc13..8ed16905b5099 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -3007,8 +3007,15 @@ class UnparseVisitor { WALK_NESTED_ENUM(OmpPrescriptiveness, Value) // OMP prescriptiveness WALK_NESTED_ENUM(OmpMapType, Value) // OMP map-type WALK_NESTED_ENUM(OmpMapTypeModifier, Value) // OMP map-type-modifier + WALK_NESTED_ENUM(OmpAlwaysModifier, Value) + WALK_NESTED_ENUM(OmpCloseModifier, Value) + WALK_NESTED_ENUM(OmpDeleteModifier, Value) + WALK_NESTED_ENUM(OmpPresentModifier, Value) + WALK_NESTED_ENUM(OmpRefModifier, Value) + WALK_NESTED_ENUM(OmpSelfModifier, Value) WALK_NESTED_ENUM(OmpTraitSelectorName, Value) WALK_NESTED_ENUM(OmpTraitSetSelectorName, Value) + WALK_NESTED_ENUM(OmpxHoldModifier, Value) #undef WALK_NESTED_ENUM void Unparse(const ReductionOperator::Operator x) { diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index cf05d8463277f..77e2fd6ca5097 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -27,7 +27,8 @@ class CanonicalizationOfOmp { public: template bool Pre(T &) { return true; } template void Post(T &) {} - CanonicalizationOfOmp(parser::Messages &messages) : messages_{messages} {} + CanonicalizationOfOmp(SemanticsContext &context) + : context_{context}, messages_{context.messages()} {} void Post(parser::Block &block) { for (auto it{block.begin()}; it != block.end(); ++it) { @@ -88,6 +89,8 @@ class CanonicalizationOfOmp { CanonicalizeUtilityConstructs(spec); } + void Post(parser::OmpMapClause &map) { CanonicalizeMapModifiers(map); } + private: template T *GetConstructIf(parser::ExecutionPartConstruct &x) { if (auto *y{std::get_if(&x.u)}) { @@ -390,16 +393,58 @@ class CanonicalizationOfOmp { omps.erase(rlast.base(), omps.end()); } + // Map clause modifiers are parsed as per OpenMP 6.0 spec. That spec has + // changed properties of some of the modifiers, for example it has expanded + // map-type-modifier into 3 individual modifiers (one for each of the + // possible values of the original modifier), and the "map-type" modifier + // is no longer ultimate. + // To utilize the modifier validation framework for semantic checks, + // if the specified OpenMP version is less than 6.0, rewrite the affected + // modifiers back into the pre-6.0 forms. 
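As an illustration of what this canonicalization handles (a sketch, not part of the patch), a pre-6.0 style map clause like the following is parsed with the new per-value modifier classes and, when -fopenmp-version is below 60, rewritten back so ALWAYS/CLOSE become OmpMapTypeModifier values and DELETE becomes the map-type again:

    !$omp target map(always, close, tofrom: x)
    !$omp target exit data map(delete: y)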
+ void CanonicalizeMapModifiers(parser::OmpMapClause &map) { + unsigned version{context_.langOptions().OpenMPVersion}; + if (version >= 60) { + return; + } + + // Omp{Always, Close, Present, xHold}Modifier -> OmpMapTypeModifier + // OmpDeleteModifier -> OmpMapType + using Modifier = parser::OmpMapClause::Modifier; + using Modifiers = std::optional>; + auto &modifiers{std::get(map.t)}; + if (!modifiers) { + return; + } + + using MapTypeModifier = parser::OmpMapTypeModifier; + using MapType = parser::OmpMapType; + + for (auto &mod : *modifiers) { + if (std::holds_alternative(mod.u)) { + mod.u = MapTypeModifier(MapTypeModifier::Value::Always); + } else if (std::holds_alternative(mod.u)) { + mod.u = MapTypeModifier(MapTypeModifier::Value::Close); + } else if (std::holds_alternative(mod.u)) { + mod.u = MapTypeModifier(MapTypeModifier::Value::Present); + } else if (std::holds_alternative(mod.u)) { + mod.u = MapTypeModifier(MapTypeModifier::Value::Ompx_Hold); + } else if (std::holds_alternative(mod.u)) { + mod.u = MapType(MapType::Value::Delete); + } + } + } + // Mapping from the specification parts to the blocks that follow in the // same construct. This is for converting utility constructs to executable // constructs. std::map blockForSpec_; + SemanticsContext &context_; parser::Messages &messages_; }; -bool CanonicalizeOmp(parser::Messages &messages, parser::Program &program) { - CanonicalizationOfOmp omp{messages}; +bool CanonicalizeOmp(SemanticsContext &context, parser::Program &program) { + CanonicalizationOfOmp omp{context}; Walk(program, omp); - return !messages.AnyFatalError(); + return !context.messages().AnyFatalError(); } } // namespace Fortran::semantics diff --git a/flang/lib/Semantics/canonicalize-omp.h b/flang/lib/Semantics/canonicalize-omp.h index c45d6bbbf9062..3251218da35ed 100644 --- a/flang/lib/Semantics/canonicalize-omp.h +++ b/flang/lib/Semantics/canonicalize-omp.h @@ -11,11 +11,12 @@ namespace Fortran::parser { struct Program; -class Messages; -} // namespace Fortran::parser +} namespace Fortran::semantics { -bool CanonicalizeOmp(parser::Messages &messages, parser::Program &program); -} +class SemanticsContext; + +bool CanonicalizeOmp(SemanticsContext &context, parser::Program &program); +} // namespace Fortran::semantics #endif // FORTRAN_SEMANTICS_CANONICALIZE_OMP_H_ diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index f9d64485f1407..a2f2906af10b8 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -151,8 +151,8 @@ class CheckHelper { void CheckProcedureAssemblyName(const Symbol &symbol); void CheckExplicitSave(const Symbol &); parser::Messages WhyNotInteroperableDerivedType(const Symbol &); - parser::Messages WhyNotInteroperableObject( - const Symbol &, bool allowNonInteroperableType = false); + parser::Messages WhyNotInteroperableObject(const Symbol &, + bool allowNonInteroperableType = false, bool forCommonBlock = false); parser::Messages WhyNotInteroperableFunctionResult(const Symbol &); parser::Messages WhyNotInteroperableProcedure(const Symbol &, bool isError); void CheckBindC(const Symbol &); @@ -519,11 +519,35 @@ void CheckHelper::Check(const Symbol &symbol) { } void CheckHelper::CheckCommonBlock(const Symbol &symbol) { + auto restorer{messages_.SetLocation(symbol.name())}; CheckGlobalName(symbol); if (symbol.attrs().test(Attr::BIND_C)) { CheckBindC(symbol); + for (auto ref : symbol.get().objects()) { + if (ref->has()) { + if (auto 
msgs{WhyNotInteroperableObject(*ref, + /*allowInteroperableType=*/false, /*forCommonBlock=*/true)}; + !msgs.empty()) { + parser::Message &reason{msgs.messages().front()}; + parser::Message *msg{nullptr}; + if (reason.IsFatal()) { + msg = messages_.Say(symbol.name(), + "'%s' may not be a member of BIND(C) COMMON block /%s/"_err_en_US, + ref->name(), symbol.name()); + } else { + msg = messages_.Say(symbol.name(), + "'%s' should not be a member of BIND(C) COMMON block /%s/"_warn_en_US, + ref->name(), symbol.name()); + } + if (msg) { + msg->Attach( + std::move(reason.set_severity(parser::Severity::Because))); + } + } + } + } } - for (MutableSymbolRef ref : symbol.get().objects()) { + for (auto ref : symbol.get().objects()) { if (ref->test(Symbol::Flag::CrayPointee)) { messages_.Say(ref->name(), "Cray pointee '%s' may not be a member of a COMMON block"_err_en_US, @@ -3154,14 +3178,16 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType( } parser::Messages CheckHelper::WhyNotInteroperableObject( - const Symbol &symbol, bool allowNonInteroperableType) { + const Symbol &symbol, bool allowNonInteroperableType, bool forCommonBlock) { parser::Messages msgs; - if (examinedByWhyNotInteroperable_.find(symbol) != - examinedByWhyNotInteroperable_.end()) { - return msgs; + if (!forCommonBlock) { + if (examinedByWhyNotInteroperable_.find(symbol) != + examinedByWhyNotInteroperable_.end()) { + return msgs; + } + examinedByWhyNotInteroperable_.insert(symbol); } bool isExplicitBindC{symbol.attrs().test(Attr::BIND_C)}; - examinedByWhyNotInteroperable_.insert(symbol); CHECK(symbol.has()); if (isExplicitBindC && !symbol.owner().IsModule()) { msgs.Say(symbol.name(), @@ -3258,7 +3284,7 @@ parser::Messages CheckHelper::WhyNotInteroperableObject( msgs.Say(symbol.name(), "An interoperable pointer must not be CONTIGUOUS"_err_en_US); } - if (msgs.AnyFatalError()) { + if (!forCommonBlock && msgs.AnyFatalError()) { examinedByWhyNotInteroperable_.erase(symbol); } return msgs; @@ -3338,8 +3364,8 @@ parser::Messages CheckHelper::WhyNotInteroperableProcedure( // on the C side by either a cdesc_t * or a void *. 
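A hedged sketch of the kind of code the new CheckCommonBlock loop is meant to diagnose; whether it is reported as an error or only a warning depends on why the member fails the interoperability check, and the names below are illustrative only:

    type :: t          ! no BIND(C), so objects of this type are not interoperable
      sequence
      integer :: n
    end type
    type(t) :: x
    common /blk/ x
    bind(c) :: /blk/   ! 'x' may/should not be a member of BIND(C) COMMON block /blk/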
F'2023 18.3.7 (5) bool allowNonInteroperableType{!dummy->attrs().test(Attr::VALUE) && (IsDescriptor(*dummy) || IsAssumedType(*dummy))}; - dummyMsgs = - WhyNotInteroperableObject(*dummy, allowNonInteroperableType); + dummyMsgs = WhyNotInteroperableObject( + *dummy, allowNonInteroperableType, /*forCommonBlock=*/false); } else { CheckBindC(*dummy); } diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp index cc1d4bf58745a..e258df86a4b1c 100644 --- a/flang/lib/Semantics/check-do-forall.cpp +++ b/flang/lib/Semantics/check-do-forall.cpp @@ -1180,7 +1180,9 @@ void DoForallChecker::Leave(const parser::IoControlSpec &ioControlSpec) { void DoForallChecker::Leave(const parser::OutputImpliedDo &outputImpliedDo) { const auto &control{std::get(outputImpliedDo.t)}; const parser::Name &name{control.name.thing.thing}; - context_.CheckIndexVarRedefine(name.source, *name.symbol); + if (name.symbol) { + context_.CheckIndexVarRedefine(name.source, *name.symbol); + } } void DoForallChecker::Leave(const parser::StatVariable &statVariable) { diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 2425265e196c6..8264e1d5e8fd9 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -37,6 +37,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Frontend/OpenMP/OMP.h" @@ -1156,8 +1157,7 @@ void OmpStructureChecker::CheckThreadprivateOrDeclareTargetVar( (sym->has() || sym->has())) { context_.Say(name->source, - "The module name or main program name cannot be in a " - "%s " + "The module name cannot be in a %s " "directive"_err_en_US, ContextDirectiveAsFortran()); } else if (!IsSaved(*name->symbol) && @@ -3399,23 +3399,22 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Detach &x) { } } -void OmpStructureChecker::CheckAllowedMapTypes( - const parser::OmpMapType::Value &type, - const std::list &allowedMapTypeList) { - if (!llvm::is_contained(allowedMapTypeList, type)) { - std::string commaSeparatedMapTypes; - llvm::interleave( - allowedMapTypeList.begin(), allowedMapTypeList.end(), - [&](const parser::OmpMapType::Value &mapType) { - commaSeparatedMapTypes.append(parser::ToUpperCaseLetters( - parser::OmpMapType::EnumToString(mapType))); - }, - [&] { commaSeparatedMapTypes.append(", "); }); - context_.Say(GetContext().clauseSource, - "Only the %s map types are permitted " - "for MAP clauses on the %s directive"_err_en_US, - commaSeparatedMapTypes, ContextDirectiveAsFortran()); +void OmpStructureChecker::CheckAllowedMapTypes(parser::OmpMapType::Value type, + llvm::ArrayRef allowed) { + if (llvm::is_contained(allowed, type)) { + return; } + + llvm::SmallVector names; + llvm::transform( + allowed, std::back_inserter(names), [](parser::OmpMapType::Value val) { + return parser::ToUpperCaseLetters( + parser::OmpMapType::EnumToString(val)); + }); + llvm::sort(names); + context_.Say(GetContext().clauseSource, + "Only the %s map types are permitted for MAP clauses on the %s directive"_err_en_US, + llvm::join(names, ", "), ContextDirectiveAsFortran()); } void OmpStructureChecker::Enter(const parser::OmpClause::Map &x) { @@ -3436,27 +3435,62 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Map &x) { CheckIteratorModifier(*iter); } if (auto *type{OmpGetUniqueModifier(modifiers)}) { + using Directive = llvm::omp::Directive; using Value = parser::OmpMapType::Value; - 
switch (GetContext().directive) { - case llvm::omp::Directive::OMPD_target: - case llvm::omp::Directive::OMPD_target_teams: - case llvm::omp::Directive::OMPD_target_teams_distribute: - case llvm::omp::Directive::OMPD_target_teams_distribute_simd: - case llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do: - case llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do_simd: - case llvm::omp::Directive::OMPD_target_data: - CheckAllowedMapTypes( - type->v, {Value::To, Value::From, Value::Tofrom, Value::Alloc}); - break; - case llvm::omp::Directive::OMPD_target_enter_data: - CheckAllowedMapTypes(type->v, {Value::To, Value::Alloc}); - break; - case llvm::omp::Directive::OMPD_target_exit_data: - CheckAllowedMapTypes( - type->v, {Value::From, Value::Release, Value::Delete}); - break; - default: - break; + + static auto isValidForVersion{ + [](parser::OmpMapType::Value t, unsigned version) { + switch (t) { + case parser::OmpMapType::Value::Alloc: + case parser::OmpMapType::Value::Delete: + case parser::OmpMapType::Value::Release: + return version < 60; + case parser::OmpMapType::Value::Storage: + return version >= 60; + default: + return true; + } + }}; + + llvm::SmallVector mapEnteringTypes{[&]() { + llvm::SmallVector result; + for (size_t i{0}; i != parser::OmpMapType::Value_enumSize; ++i) { + auto t{static_cast(i)}; + if (isValidForVersion(t, version) && IsMapEnteringType(t)) { + result.push_back(t); + } + } + return result; + }()}; + llvm::SmallVector mapExitingTypes{[&]() { + llvm::SmallVector result; + for (size_t i{0}; i != parser::OmpMapType::Value_enumSize; ++i) { + auto t{static_cast(i)}; + if (isValidForVersion(t, version) && IsMapExitingType(t)) { + result.push_back(t); + } + } + return result; + }()}; + + llvm::omp::Directive dir{GetContext().directive}; + llvm::ArrayRef leafs{ + llvm::omp::getLeafConstructsOrSelf(dir)}; + + if (llvm::is_contained(leafs, Directive::OMPD_target) || + llvm::is_contained(leafs, Directive::OMPD_target_data)) { + if (version >= 60) { + // Map types listed in the decay table. 
[6.0:276] + CheckAllowedMapTypes( + type->v, {Value::Storage, Value::From, Value::To, Value::Tofrom}); + } else { + CheckAllowedMapTypes( + type->v, {Value::Alloc, Value::From, Value::To, Value::Tofrom}); + } + } else if (llvm::is_contained(leafs, Directive::OMPD_target_enter_data)) { + CheckAllowedMapTypes(type->v, mapEnteringTypes); + } else if (llvm::is_contained(leafs, Directive::OMPD_target_exit_data)) { + CheckAllowedMapTypes(type->v, mapExitingTypes); } } diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 6a877a5d0a7c0..f4a291dc255c8 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -179,8 +179,8 @@ class OmpStructureChecker void HasInvalidDistributeNesting(const parser::OpenMPLoopConstruct &x); void HasInvalidLoopBinding(const parser::OpenMPLoopConstruct &x); // specific clause related - void CheckAllowedMapTypes(const parser::OmpMapType::Value &, - const std::list &); + void CheckAllowedMapTypes( + parser::OmpMapType::Value, llvm::ArrayRef); const std::list &GetTraitPropertyList( const parser::OmpTraitSelector &); diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 53ec3827893d0..14473724f0f40 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -178,7 +178,7 @@ class ArgumentAnalyzer { } // Find and return a user-defined assignment std::optional TryDefinedAssignment(); - std::optional GetDefinedAssignmentProc(); + std::optional GetDefinedAssignmentProc(bool &isAmbiguous); std::optional GetType(std::size_t) const; void Dump(llvm::raw_ostream &); @@ -191,7 +191,7 @@ class ArgumentAnalyzer { MaybeExpr AnalyzeExprOrWholeAssumedSizeArray(const parser::Expr &); bool AreConformable() const; const Symbol *FindBoundOp(parser::CharBlock, int passIndex, - const Symbol *&generic, bool isSubroutine); + const Symbol *&generic, bool isSubroutine, bool *isAmbiguous = nullptr); void AddAssignmentConversion( const DynamicType &lhsType, const DynamicType &rhsType); bool OkLogicalIntegerAssignment(TypeCategory lhs, TypeCategory rhs); @@ -199,7 +199,8 @@ class ArgumentAnalyzer { bool IsBOZLiteral(std::size_t i) const { return evaluate::IsBOZLiteral(GetExpr(i)); } - void SayNoMatch(const std::string &, bool isAssignment = false); + void SayNoMatch( + const std::string &, bool isAssignment = false, bool isAmbiguous = false); std::string TypeAsFortran(std::size_t); bool AnyUntypedOrMissingOperand(); @@ -4781,7 +4782,9 @@ std::optional ArgumentAnalyzer::TryDefinedAssignment() { return std::nullopt; // user-defined assignment not allowed for these args } auto restorer{context_.GetContextualMessages().SetLocation(source_)}; - if (std::optional procRef{GetDefinedAssignmentProc()}) { + bool isAmbiguous{false}; + if (std::optional procRef{ + GetDefinedAssignmentProc(isAmbiguous)}) { if (context_.inWhereBody() && !procRef->proc().IsElemental()) { // C1032 context_.Say( "Defined assignment in WHERE must be elemental, but '%s' is not"_err_en_US, @@ -4791,9 +4794,11 @@ std::optional ArgumentAnalyzer::TryDefinedAssignment() { return std::move(*procRef); } if (isDefined == Tristate::Yes) { - if (!lhsType || !rhsType || (lhsRank != rhsRank && rhsRank != 0) || + if (isAmbiguous || !lhsType || !rhsType || + (lhsRank != rhsRank && rhsRank != 0) || !OkLogicalIntegerAssignment(lhsType->category(), rhsType->category())) { - SayNoMatch("ASSIGNMENT(=)", true); + SayNoMatch( + "ASSIGNMENT(=)", /*isAssignment=*/true, 
/*isAmbiguous=*/isAmbiguous); } } else if (!fatalErrors_) { CheckAssignmentConformance(); @@ -4822,13 +4827,15 @@ bool ArgumentAnalyzer::OkLogicalIntegerAssignment( return true; } -std::optional ArgumentAnalyzer::GetDefinedAssignmentProc() { +std::optional ArgumentAnalyzer::GetDefinedAssignmentProc( + bool &isAmbiguous) { const Symbol *proc{nullptr}; bool isProcElemental{false}; std::optional passedObjectIndex; std::string oprNameString{"assignment(=)"}; parser::CharBlock oprName{oprNameString}; const auto &scope{context_.context().FindScope(source_)}; + isAmbiguous = false; { auto restorer{context_.GetContextualMessages().DiscardMessages()}; if (const Symbol *symbol{scope.FindSymbol(oprName)}) { @@ -4842,8 +4849,8 @@ std::optional ArgumentAnalyzer::GetDefinedAssignmentProc() { for (std::size_t i{0}; (!proc || isProcElemental) && i < actuals_.size(); ++i) { const Symbol *generic{nullptr}; - if (const Symbol * - binding{FindBoundOp(oprName, i, generic, /*isSubroutine=*/true)}) { + if (const Symbol *binding{FindBoundOp(oprName, i, generic, + /*isSubroutine=*/true, /*isAmbiguous=*/&isAmbiguous)}) { // ignore inaccessible type-bound ASSIGNMENT(=) generic if (!CheckAccessibleSymbol(scope, DEREF(generic))) { const Symbol *resolution{GetBindingResolution(GetType(i), *binding)}; @@ -4967,7 +4974,8 @@ bool ArgumentAnalyzer::AreConformable() const { // Look for a type-bound operator in the type of arg number passIndex. const Symbol *ArgumentAnalyzer::FindBoundOp(parser::CharBlock oprName, - int passIndex, const Symbol *&generic, bool isSubroutine) { + int passIndex, const Symbol *&generic, bool isSubroutine, + bool *isAmbiguous) { const auto *type{GetDerivedTypeSpec(GetType(passIndex))}; const semantics::Scope *scope{type ? type->scope() : nullptr}; if (scope) { @@ -4989,6 +4997,9 @@ const Symbol *ArgumentAnalyzer::FindBoundOp(parser::CharBlock oprName, // Use the most recent override of the binding, if any return scope->FindComponent(binding->name()); } else { + if (isAmbiguous) { + *isAmbiguous = pair.second; + } context_.EmitGenericResolutionError(*generic, pair.second, isSubroutine); } } @@ -5072,40 +5083,37 @@ void ArgumentAnalyzer::ConvertBOZAssignmentRHS(const DynamicType &lhsType) { } // Report error resolving opr when there is a user-defined one available -void ArgumentAnalyzer::SayNoMatch(const std::string &opr, bool isAssignment) { +void ArgumentAnalyzer::SayNoMatch( + const std::string &opr, bool isAssignment, bool isAmbiguous) { std::string type0{TypeAsFortran(0)}; auto rank0{actuals_[0]->Rank()}; + std::string prefix{"No intrinsic or user-defined "s + opr + " matches"}; + if (isAmbiguous) { + prefix = "Multiple specific procedures for the generic "s + opr + " match"; + } if (actuals_.size() == 1) { if (rank0 > 0) { - context_.Say("No intrinsic or user-defined %s matches " - "rank %d array of %s"_err_en_US, - opr, rank0, type0); + context_.Say("%s rank %d array of %s"_err_en_US, prefix, rank0, type0); } else { - context_.Say("No intrinsic or user-defined %s matches " - "operand type %s"_err_en_US, - opr, type0); + context_.Say("%s operand type %s"_err_en_US, prefix, type0); } } else { std::string type1{TypeAsFortran(1)}; auto rank1{actuals_[1]->Rank()}; if (rank0 > 0 && rank1 > 0 && rank0 != rank1) { - context_.Say("No intrinsic or user-defined %s matches " - "rank %d array of %s and rank %d array of %s"_err_en_US, - opr, rank0, type0, rank1, type1); + context_.Say("%s rank %d array of %s and rank %d array of %s"_err_en_US, + prefix, rank0, type0, rank1, type1); } else if (isAssignment 
&& rank0 != rank1) { if (rank0 == 0) { - context_.Say("No intrinsic or user-defined %s matches " - "scalar %s and rank %d array of %s"_err_en_US, - opr, type0, rank1, type1); + context_.Say("%s scalar %s and rank %d array of %s"_err_en_US, prefix, + type0, rank1, type1); } else { - context_.Say("No intrinsic or user-defined %s matches " - "rank %d array of %s and scalar %s"_err_en_US, - opr, rank0, type0, type1); + context_.Say("%s rank %d array of %s and scalar %s"_err_en_US, prefix, + rank0, type0, type1); } } else { - context_.Say("No intrinsic or user-defined %s matches " - "operand types %s and %s"_err_en_US, - opr, type0, type1); + context_.Say( + "%s operand types %s and %s"_err_en_US, prefix, type0, type1); } } } diff --git a/flang/lib/Semantics/openmp-modifiers.cpp b/flang/lib/Semantics/openmp-modifiers.cpp index c84e832ee52a1..336ce4beb24ba 100644 --- a/flang/lib/Semantics/openmp-modifiers.cpp +++ b/flang/lib/Semantics/openmp-modifiers.cpp @@ -140,6 +140,22 @@ OmpGetDescriptor() { return desc; } +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"always-modifier", + /*props=*/ + { + {45, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {45, {Clause::OMPC_map}}, + }, + }; + return desc; +} + template <> const OmpModifierDescriptor &OmpGetDescriptor() { static const OmpModifierDescriptor desc{ @@ -156,6 +172,22 @@ const OmpModifierDescriptor &OmpGetDescriptor() { return desc; } +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"close-modifier", + /*props=*/ + { + {50, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {50, {Clause::OMPC_map}}, + }, + }; + return desc; +} + template <> const OmpModifierDescriptor &OmpGetDescriptor() { static const OmpModifierDescriptor desc{ @@ -173,6 +205,23 @@ const OmpModifierDescriptor &OmpGetDescriptor() { return desc; } +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"delete-modifier", + /*props=*/ + { + {45, {OmpProperty::Unique, OmpProperty::Ultimate}}, + {60, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {45, {Clause::OMPC_map}}, + }, + }; + return desc; +} + template <> const OmpModifierDescriptor &OmpGetDescriptor() { static const OmpModifierDescriptor desc{ @@ -347,6 +396,7 @@ const OmpModifierDescriptor &OmpGetDescriptor() { /*props=*/ { {45, {OmpProperty::Ultimate}}, + {60, {OmpProperty::Unique}}, }, /*clauses=*/ { @@ -367,6 +417,7 @@ const OmpModifierDescriptor &OmpGetDescriptor() { /*clauses=*/ { {45, {Clause::OMPC_map}}, + {60, {}}, }, }; return desc; @@ -420,6 +471,22 @@ const OmpModifierDescriptor &OmpGetDescriptor() { return desc; } +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"present-modifier", + /*props=*/ + { + {51, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {51, {Clause::OMPC_map}}, + }, + }; + return desc; +} + template <> const OmpModifierDescriptor & OmpGetDescriptor() { @@ -456,6 +523,38 @@ const OmpModifierDescriptor &OmpGetDescriptor() { return desc; } +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"ref-modifier", + /*props=*/ + { + {60, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {60, {Clause::OMPC_map}}, + }, + }; + return desc; +} + +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + 
/*name=*/"self-modifier", + /*props=*/ + { + {60, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {60, {Clause::OMPC_map}}, + }, + }; + return desc; +} + template <> const OmpModifierDescriptor & OmpGetDescriptor() { @@ -522,4 +621,20 @@ const OmpModifierDescriptor &OmpGetDescriptor() { }; return desc; } + +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"ompx-hold-modifier", + /*props=*/ + { + {45, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {45, {Clause::OMPC_map}}, + }, + }; + return desc; +} } // namespace Fortran::semantics diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp index f43d2cc75620e..da14507aa9fe6 100644 --- a/flang/lib/Semantics/openmp-utils.cpp +++ b/flang/lib/Semantics/openmp-utils.cpp @@ -143,6 +143,31 @@ bool IsVarOrFunctionRef(const MaybeExpr &expr) { } } +bool IsMapEnteringType(parser::OmpMapType::Value type) { + switch (type) { + case parser::OmpMapType::Value::Alloc: + case parser::OmpMapType::Value::Storage: + case parser::OmpMapType::Value::To: + case parser::OmpMapType::Value::Tofrom: + return true; + default: + return false; + } +} + +bool IsMapExitingType(parser::OmpMapType::Value type) { + switch (type) { + case parser::OmpMapType::Value::Delete: + case parser::OmpMapType::Value::From: + case parser::OmpMapType::Value::Release: + case parser::OmpMapType::Value::Storage: + case parser::OmpMapType::Value::Tofrom: + return true; + default: + return false; + } +} + std::optional GetEvaluateExpr(const parser::Expr &parserExpr) { const parser::TypedExpr &typedExpr{parserExpr.typedExpr}; // ForwardOwningPointer typedExpr diff --git a/flang/lib/Semantics/openmp-utils.h b/flang/lib/Semantics/openmp-utils.h index a96c008fb26e7..001fbeb45ceec 100644 --- a/flang/lib/Semantics/openmp-utils.h +++ b/flang/lib/Semantics/openmp-utils.h @@ -59,6 +59,9 @@ bool IsExtendedListItem(const Symbol &sym); bool IsVariableListItem(const Symbol &sym); bool IsVarOrFunctionRef(const MaybeExpr &expr); +bool IsMapEnteringType(parser::OmpMapType::Value type); +bool IsMapExitingType(parser::OmpMapType::Value type); + std::optional GetEvaluateExpr(const parser::Expr &parserExpr); std::optional GetDynamicType( const parser::Expr &parserExpr); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 151f4ccae634e..4c3e509b5a36d 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -138,6 +138,9 @@ class AccAttributeVisitor : DirectiveAttributeVisitor { void Post(const parser::OpenACCBlockConstruct &) { PopContext(); } bool Pre(const parser::OpenACCCombinedConstruct &); void Post(const parser::OpenACCCombinedConstruct &) { PopContext(); } + void Post(const parser::AccBeginCombinedDirective &) { + GetContext().withinConstruct = true; + } bool Pre(const parser::OpenACCDeclarativeConstruct &); void Post(const parser::OpenACCDeclarativeConstruct &) { PopContext(); } @@ -160,6 +163,18 @@ class AccAttributeVisitor : DirectiveAttributeVisitor { GetContext().withinConstruct = true; } + // TODO: We should probably also privatize ConcurrentBounds. 
+ template + bool Pre(const parser::LoopBounds &x) { + if (!dirContext_.empty() && GetContext().withinConstruct) { + if (auto *symbol{ResolveAcc( + x.name.thing, Symbol::Flag::AccPrivate, currScope())}) { + AddToContextObjectWithDSA(*symbol, Symbol::Flag::AccPrivate); + } + } + return true; + } + bool Pre(const parser::OpenACCStandaloneConstruct &); void Post(const parser::OpenACCStandaloneConstruct &) { PopContext(); } void Post(const parser::AccStandaloneDirective &) { @@ -712,7 +727,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { void Post(const parser::EorLabel &eorLabel) { CheckSourceLabel(eorLabel.v); } void Post(const parser::OmpMapClause &x) { - Symbol::Flag ompFlag = Symbol::Flag::OmpMapToFrom; + unsigned version{context_.langOptions().OpenMPVersion}; + std::optional ompFlag; + auto &mods{OmpGetModifiers(x)}; if (auto *mapType{OmpGetUniqueModifier(mods)}) { switch (mapType->v) { @@ -726,16 +743,33 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { ompFlag = Symbol::Flag::OmpMapToFrom; break; case parser::OmpMapType::Value::Alloc: - ompFlag = Symbol::Flag::OmpMapAlloc; - break; case parser::OmpMapType::Value::Release: - ompFlag = Symbol::Flag::OmpMapRelease; + case parser::OmpMapType::Value::Storage: + ompFlag = Symbol::Flag::OmpMapStorage; break; case parser::OmpMapType::Value::Delete: ompFlag = Symbol::Flag::OmpMapDelete; break; } } + if (!ompFlag) { + if (version >= 60) { + // [6.0:275:12-15] + // When a map-type is not specified for a clause on which it may be + // specified, the map-type defaults to storage if the delete-modifier + // is present on the clause or if the list item for which the map-type + // is not specified is an assumed-size array. + if (OmpGetUniqueModifier(mods)) { + ompFlag = Symbol::Flag::OmpMapStorage; + } + // Otherwise, if delete-modifier is absent, leave ompFlag unset. + } else { + // [5.2:151:10] + // If a map-type is not specified, the map-type defaults to tofrom. 
+ ompFlag = Symbol::Flag::OmpMapToFrom; + } + } + const auto &ompObjList{std::get(x.t)}; for (const auto &ompObj : ompObjList.v) { common::visit( @@ -744,15 +778,15 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { if (const auto *name{ semantics::getDesignatorNameIfDataRef(designator)}) { if (name->symbol) { - name->symbol->set(ompFlag); - AddToContextObjectWithDSA(*name->symbol, ompFlag); - } - if (name->symbol && - semantics::IsAssumedSizeArray(*name->symbol)) { - context_.Say(designator.source, - "Assumed-size whole arrays may not appear on the %s " - "clause"_err_en_US, - "MAP"); + name->symbol->set( + ompFlag.value_or(Symbol::Flag::OmpMapStorage)); + AddToContextObjectWithDSA(*name->symbol, *ompFlag); + if (semantics::IsAssumedSizeArray(*name->symbol)) { + context_.Say(designator.source, + "Assumed-size whole arrays may not appear on the %s " + "clause"_err_en_US, + "MAP"); + } } } }, @@ -760,7 +794,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { }, ompObj.u); - ResolveOmpObject(ompObj, ompFlag); + ResolveOmpObject(ompObj, ompFlag.value_or(Symbol::Flag::OmpMapStorage)); } } @@ -2759,9 +2793,8 @@ void OmpAttributeVisitor::ResolveOmpObject( } Symbol::Flag dataMappingAttributeFlags[] = { Symbol::Flag::OmpMapTo, Symbol::Flag::OmpMapFrom, - Symbol::Flag::OmpMapToFrom, Symbol::Flag::OmpMapAlloc, - Symbol::Flag::OmpMapRelease, Symbol::Flag::OmpMapDelete, - Symbol::Flag::OmpIsDevicePtr, + Symbol::Flag::OmpMapToFrom, Symbol::Flag::OmpMapStorage, + Symbol::Flag::OmpMapDelete, Symbol::Flag::OmpIsDevicePtr, Symbol::Flag::OmpHasDeviceAddr}; Symbol::Flag dataSharingAttributeFlags[] = { diff --git a/flang/lib/Semantics/resolve-labels.cpp b/flang/lib/Semantics/resolve-labels.cpp index b0cbc4b56e889..9454ef9fe928a 100644 --- a/flang/lib/Semantics/resolve-labels.cpp +++ b/flang/lib/Semantics/resolve-labels.cpp @@ -489,15 +489,29 @@ class ParseTreeAnalyzer { // C1401 void Post(const parser::MainProgram &mainProgram) { + // Uppercase the name of the main program, so that its symbol name + // would be unique from similarly named non-main-program symbols. 
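To illustrate the effect of the uppercasing performed here: for a source like the one below, the main program's stored name becomes upper case, which is why the unparse and debug-info tests later in this patch change "PROGRAM main"/"mn" to "PROGRAM MAIN"/"MN".

    program main
    end program main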
+ auto upperCaseCharBlock = [](const parser::CharBlock &cb) { + auto ch{const_cast(cb.begin())}; + for (char *endCh{ch + cb.size()}; ch != endCh; ++ch) { + *ch = parser::ToUpperCaseLetter(*ch); + } + }; + const parser::CharBlock *progName{nullptr}; + if (const auto &program{ + std::get>>( + mainProgram.t)}) { + progName = &program->statement.v.source; + upperCaseCharBlock(*progName); + } if (const parser::CharBlock * endName{GetStmtName(std::get>( mainProgram.t))}) { - if (const auto &program{ - std::get>>( - mainProgram.t)}) { - if (*endName != program->statement.v.source) { + upperCaseCharBlock(*endName); + if (progName) { + if (*endName != *progName) { context_.Say(*endName, "END PROGRAM name mismatch"_err_en_US) - .Attach(program->statement.v.source, "should be"_en_US); + .Attach(*progName, "should be"_en_US); } } else { context_.Say(*endName, diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 96faa5fd954cd..b3268605e7c0c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -8574,8 +8574,10 @@ bool ResolveNamesVisitor::Pre(const parser::ImportStmt &x) { } else { Say(name, "A distinct '%s' is already present in this scope"_err_en_US) - .Attach(symbol->name(), "Previous declaration of '%s'"_en_US) - .Attach(outer->name(), "Declaration of '%s' in host scope"_en_US); + .Attach(symbol->name(), "Previous declaration of '%s'"_en_US, + symbol->name().ToString()) + .Attach(outer->name(), "Declaration of '%s' in host scope"_en_US, + outer->name().ToString()); } } } else { diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 51ba21a9e5edf..5916a07df7744 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -1131,7 +1131,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( if (auto proc{evaluate::characteristics::Procedure::Characterize( specific, context_.foldingContext())}) { std::uint8_t isArgDescriptorSet{0}; - std::uint8_t isArgContiguousSet{0}; + bool specialCaseFlag{0}; int argThatMightBeDescriptor{0}; MaybeExpr which; if (isAssignment) { @@ -1197,7 +1197,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( TypeAndShape::Attr::AssumedShape) || dummyData.attrs.test(evaluate::characteristics:: DummyDataObject::Attr::Contiguous)) { - isArgContiguousSet |= 1; + specialCaseFlag = true; } } } @@ -1216,7 +1216,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( return; } if (ddo->type.type().IsPolymorphic()) { - isArgDescriptorSet |= 1; + argThatMightBeDescriptor = 1; } switch (io.value()) { case common::DefinedIo::ReadFormatted: @@ -1232,6 +1232,9 @@ void RuntimeTableBuilder::DescribeSpecialProc( which = writeUnformattedEnum_; break; } + if (context_.defaultKinds().GetDefaultKind(TypeCategory::Integer) == 8) { + specialCaseFlag = true; // UNIT= & IOSTAT= INTEGER(8) + } } if (argThatMightBeDescriptor != 0) { if (const auto *dummyData{ @@ -1262,8 +1265,8 @@ void RuntimeTableBuilder::DescribeSpecialProc( } CHECK(bindingIndex <= 255); AddValue(values, specialSchema_, "istypebound"s, IntExpr<1>(bindingIndex)); - AddValue(values, specialSchema_, "isargcontiguousset"s, - IntExpr<1>(isArgContiguousSet)); + AddValue(values, specialSchema_, "specialcaseflag"s, + IntExpr<1>(specialCaseFlag)); AddValue(values, specialSchema_, procCompName, SomeExpr{evaluate::ProcedureDesignator{specific}}); // index might already be present in the case of an override @@ -1383,19 +1386,26 @@ CollectNonTbpDefinedIoGenericInterfaces( } 
else { // Local scope's specific overrides host's for this type bool updated{false}; + std::uint8_t flags{0}; + if (declType->IsPolymorphic()) { + flags |= IsDtvArgPolymorphic; + } + if (scope.context().GetDefaultKind(TypeCategory::Integer) == + 8) { + flags |= DefinedIoInteger8; + } for (auto [iter, end]{result.equal_range(dtDesc)}; iter != end; ++iter) { NonTbpDefinedIo &nonTbp{iter->second}; if (nonTbp.definedIo == which) { nonTbp.subroutine = &*specific; - nonTbp.isDtvArgPolymorphic = declType->IsPolymorphic(); + nonTbp.flags = flags; updated = true; } } if (!updated) { - result.emplace(dtDesc, - NonTbpDefinedIo{ - &*specific, which, declType->IsPolymorphic()}); + result.emplace( + dtDesc, NonTbpDefinedIo{&*specific, which, flags}); } } } diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index ab78605d01f4c..6db11aaf56c2a 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -376,8 +376,7 @@ const DeclTypeSpec &SemanticsContext::MakeLogicalType(int kind) { } bool SemanticsContext::AnyFatalError() const { - return !messages_.empty() && - (warningsAreErrors_ || messages_.AnyFatalError()); + return messages_.AnyFatalError(warningsAreErrors_); } bool SemanticsContext::HasError(const Symbol &symbol) { return errorSymbols_.count(symbol) > 0; @@ -643,8 +642,7 @@ bool Semantics::Perform() { return ValidateLabels(context_, program_) && parser::CanonicalizeDo(program_) && // force line break CanonicalizeAcc(context_.messages(), program_) && - CanonicalizeOmp(context_.messages(), program_) && - CanonicalizeCUDA(program_) && + CanonicalizeOmp(context_, program_) && CanonicalizeCUDA(program_) && PerformStatementSemantics(context_, program_) && CanonicalizeDirectives(context_.messages(), program_) && ModFileWriter{context_} @@ -658,7 +656,7 @@ void Semantics::EmitMessages(llvm::raw_ostream &os) { context_.messages().ResolveProvenances(context_.allCookedSources()); context_.messages().Emit(os, context_.allCookedSources(), /*echoSourceLine=*/true, &context_.languageFeatures(), - /*maxErrorsToEmit=*/context_.maxErrors()); + context_.maxErrors(), context_.warningsAreErrors()); } void SemanticsContext::DumpSymbols(llvm::raw_ostream &os) { diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp index 0380207927ad3..2259cfcf23ece 100644 --- a/flang/lib/Semantics/symbol.cpp +++ b/flang/lib/Semantics/symbol.cpp @@ -861,8 +861,7 @@ std::string Symbol::OmpFlagToClauseName(Symbol::Flag ompFlag) { case Symbol::Flag::OmpMapTo: case Symbol::Flag::OmpMapFrom: case Symbol::Flag::OmpMapToFrom: - case Symbol::Flag::OmpMapAlloc: - case Symbol::Flag::OmpMapRelease: + case Symbol::Flag::OmpMapStorage: case Symbol::Flag::OmpMapDelete: clauseName = "MAP"; break; diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 5e5b43f26c791..5a5b02e1ac3ce 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -348,9 +348,9 @@ const Symbol &BypassGeneric(const Symbol &symbol) { const Symbol &GetCrayPointer(const Symbol &crayPointee) { const Symbol *found{nullptr}; - for (const auto &[pointee, pointer] : - crayPointee.GetUltimate().owner().crayPointers()) { - if (pointee == crayPointee.name()) { + const Symbol &ultimate{crayPointee.GetUltimate()}; + for (const auto &[pointee, pointer] : ultimate.owner().crayPointers()) { + if (pointee == ultimate.name()) { found = &pointer.get(); break; } diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90 index 
8dd27d6e4c01b..6af2a5a5e30ff 100644 --- a/flang/module/__fortran_type_info.f90 +++ b/flang/module/__fortran_type_info.f90 @@ -118,7 +118,7 @@ integer(1) :: which ! SpecialBinding::Which integer(1) :: isArgDescriptorSet integer(1) :: isTypeBound ! binding index + 1, if any - integer(1) :: isArgContiguousSet + integer(1) :: specialCaseFlag integer(1) :: __padding0(4) type(__builtin_c_funptr) :: proc end type diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index f8a30da8b9615..d0c312c09353f 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -957,11 +957,21 @@ attributes(device) pure integer function atomicxori(address, val) ! Time function + interface + attributes(device) integer function clock() + end function + end interface + interface attributes(device) integer(8) function clock64() end function end interface + interface + attributes(device) integer(8) function globalTimer() + end function + end interface + ! Warp Match Functions interface match_all_sync diff --git a/flang/test/Driver/cuda-option.f90 b/flang/test/Driver/cuda-option.f90 index 0740ed509a077..f55e88dab20ce 100644 --- a/flang/test/Driver/cuda-option.f90 +++ b/flang/test/Driver/cuda-option.f90 @@ -8,7 +8,7 @@ program main integer, device :: dvar end program -! CHECK-LABEL: PROGRAM main +! CHECK-LABEL: PROGRAM MAIN ! CHECK: INTEGER :: var = 1 ! CHECK: INTEGER, DEVICE :: dvar diff --git a/flang/test/Driver/fatal-errors-warnings.f90 b/flang/test/Driver/fatal-errors-warnings.f90 new file mode 100644 index 0000000000000..2de09c3ed0778 --- /dev/null +++ b/flang/test/Driver/fatal-errors-warnings.f90 @@ -0,0 +1,31 @@ +! RUN: %flang_fc1 -Wfatal-errors -pedantic %s 2>&1 | FileCheck %s --check-prefix=CHECK1 +! RUN: not %flang_fc1 -pedantic -Werror %s 2>&1 | FileCheck %s --check-prefix=CHECK2 +! RUN: not %flang_fc1 -Wfatal-errors -pedantic -Werror %s 2>&1 | FileCheck %s --check-prefix=CHECK3 + +module m + contains + subroutine foo(a) + real, intent(in), target :: a(:) + end subroutine +end module + +program test + use m + real, target :: a(1) + real :: b(1) + call foo(a) ! ok + !CHECK1: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK2: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK3: fatal-errors-warnings.f90:{{.*}} warning: + call foo(b) + !CHECK1: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK2: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK3-NOT: error: + !CHECK3-NOT: warning: + call foo((a)) + !CHECK1: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK2: fatal-errors-warnings.f90:{{.*}} warning: + call foo(a([1])) + !! Hard error instead of warning if uncommented. + !call foo(a(1)) +end \ No newline at end of file diff --git a/flang/test/Driver/intrinsic-module-path.f90 b/flang/test/Driver/intrinsic-module-path.f90 index 8fe486cf61c83..615d8f9a1730a 100644 --- a/flang/test/Driver/intrinsic-module-path.f90 +++ b/flang/test/Driver/intrinsic-module-path.f90 @@ -8,6 +8,7 @@ !----------------------------------------- ! RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s --allow-empty --check-prefix=WITHOUT ! RUN: not %flang_fc1 -fsyntax-only -fintrinsic-modules-path %S/Inputs/ %s 2>&1 | FileCheck %s --check-prefix=GIVEN +! RUN: not %flang_fc1 -fsyntax-only -fintrinsic-modules-path=%S/Inputs/ %s 2>&1 | FileCheck %s --check-prefix=GIVEN ! WITHOUT-NOT: 'ieee_arithmetic.mod' was not found ! 
WITHOUT-NOT: 'iso_fortran_env.mod' was not found diff --git a/flang/test/Driver/target-cpu-features.f90 b/flang/test/Driver/target-cpu-features.f90 index 5a3fd0d838002..e7da964184c85 100644 --- a/flang/test/Driver/target-cpu-features.f90 +++ b/flang/test/Driver/target-cpu-features.f90 @@ -44,6 +44,10 @@ ! RUN: %flang --target=loongarch64-linux-gnu -c %s -### 2>&1 \ ! RUN: | FileCheck %s -check-prefix=CHECK-LOONGARCH64 +! RUN: %flang --target=sparc64-linux-gnu -c -### %s 2>&1 | FileCheck %s -check-prefix=CHECK-SPARC-VIS +! RUN: %flang --target=sparc64-freebsd -c -### %s 2>&1 | FileCheck %s -check-prefix=CHECK-SPARC-VIS +! RUN: %flang --target=sparc64-openbsd -c -### %s 2>&1 | FileCheck %s -check-prefix=CHECK-SPARC-VIS + ! CHECK-A57: "-fc1" "-triple" "aarch64-unknown-linux-gnu" ! CHECK-A57-SAME: "-target-cpu" "cortex-a57" ! CHECK-A57-SAME: "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2 @@ -92,3 +96,6 @@ ! CHECK-LOONGARCH64: "-fc1" "-triple" "loongarch64-unknown-linux-gnu" ! CHECK-LOONGARCH64-SAME: "-target-cpu" "loongarch64" "-target-feature" "+lsx" "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+ual" + +! CHECK-SPARC-VIS: "-fc1" "-triple" "sparc64-{{[^"]+}}" +! CHECK-SPARC-VIS-SAME: "-target-feature" "+vis" diff --git a/flang/test/Driver/unparse-use-analyzed.f95 b/flang/test/Driver/unparse-use-analyzed.f95 index eb6046aebba54..4bcd72c9a5f50 100644 --- a/flang/test/Driver/unparse-use-analyzed.f95 +++ b/flang/test/Driver/unparse-use-analyzed.f95 @@ -6,12 +6,12 @@ ! RUN: %flang_fc1 -fdebug-unparse %s | FileCheck %s --check-prefix=DEFAULT ! RUN: %flang_fc1 -fdebug-unparse -fno-analyzed-objects-for-unparse %s | FileCheck %s --check-prefix=DISABLED -! DEFAULT: PROGRAM test +! DEFAULT: PROGRAM TEST ! DEFAULT-NEXT: REAL, PARAMETER :: val = 3.43e2_4 ! DEFAULT-NEXT: PRINT *, 3.47e2_4 ! DEFAULT-NEXT: END PROGRAM -! DISABLED: PROGRAM test +! DISABLED: PROGRAM TEST ! DISABLED-NEXT: REAL, PARAMETER :: val = 343.0 ! DISABLED-NEXT: PRINT *, val+4 ! 
DISABLED-NEXT: END PROGRAM diff --git a/flang/test/Driver/unparse-with-modules.f90 b/flang/test/Driver/unparse-with-modules.f90 index 53997f7804efa..f6444afbe47c1 100644 --- a/flang/test/Driver/unparse-with-modules.f90 +++ b/flang/test/Driver/unparse-with-modules.f90 @@ -25,7 +25,7 @@ program test !CHECK: implicit none !CHECK: real(kind=real32) x !CHECK: end module -!CHECK: program test +!CHECK: program TEST !CHECK: use :: m1 !CHECK: use :: basictestmoduletwo !CHECK: implicit none diff --git a/flang/test/Fir/CUDA/cuda-device-global.f90 b/flang/test/Fir/CUDA/cuda-device-global.f90 index 4c634513745fd..35c025dad3000 100644 --- a/flang/test/Fir/CUDA/cuda-device-global.f90 +++ b/flang/test/Fir/CUDA/cuda-device-global.f90 @@ -24,3 +24,26 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.conta // CHECK: gpu.module @cuda_device_mod // CHECK-DAG: fir.global @_QMm2ECc // CHECK-DAG: fir.global @_QMm1ECb + +// ----- + +module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} { + fir.global @_QMmEddarrays {data_attr = #cuf.cuda} : !fir.box>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>> { + %c0 = arith.constant 0 : index + %0 = fir.zero_bits !fir.heap>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>> + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) {allocator_idx = 3 : i32} : (!fir.heap>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>, !fir.shape<1>) -> !fir.box>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>> + fir.has_value %2 : !fir.box>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>> + } + fir.global linkonce_odr @_QMmE.dt.devicearrays constant target : !fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}> { + %0 = fir.undefined 
!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}> + fir.has_value %0 : !fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}> + } +} + + +// CHECK-NAG: fir.global @_QMmEddarrays +// CHECK-NAG: fir.global linkonce_odr @_QMmE.dt.devicearrays +// CHECK: gpu.module @cuda_device_mod +// CHECK-NAG: fir.global @_QMmEddarrays +// CHECK-NAG: fir.global linkonce_odr @_QMmE.dt.devicearrays diff --git a/flang/test/Fir/alloc-32.fir b/flang/test/Fir/alloc-32.fir index 3eefc3225fac7..a3cbf200c24fc 100644 --- a/flang/test/Fir/alloc-32.fir +++ b/flang/test/Fir/alloc-32.fir @@ -20,7 +20,9 @@ func.func @allocmem_scalar_nonchar() -> !fir.heap { // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]] -// CHECK: %[[trunc:.*]] = trunc i64 %[[mul2]] to i32 +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[sz:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: %[[trunc:.*]] = trunc i64 %[[sz]] to i32 // CHECK: call ptr @malloc(i32 %[[trunc]]) func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { %1 = fir.allocmem !fir.char<1,?>(%l : i32) diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index 5b4930bb9cb34..8da8b828c18b9 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -87,7 +87,9 @@ func.func @alloca_scalar_dynchar_kind(%l : i32) -> !fir.ref> { // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]] -// CHECK: call ptr @malloc(i64 %[[mul2]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { %1 = fir.allocmem 
!fir.char<1,?>(%l : i32) return %1 : !fir.heap> @@ -97,7 +99,9 @@ func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 2, %[[mul1]] -// CHECK: call ptr @malloc(i64 %[[mul2]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_scalar_dynchar_kind(%l : i32) -> !fir.heap>{ %1 = fir.allocmem !fir.char<2,?>(%l : i32) return %1 : !fir.heap> @@ -152,7 +156,9 @@ func.func @allocmem_array_of_char() -> !fir.heap> // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 9, %[[mul1]] -// CHECK: call ptr @malloc(i64 %[[mul2]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_array_of_dynchar(%l: i32) -> !fir.heap>> { %1 = fir.allocmem !fir.array<3x3x!fir.char<1,?>>(%l : i32) return %1 : !fir.heap>> @@ -180,7 +186,9 @@ func.func @alloca_dynarray_of_nonchar2(%e: index) -> !fir.ref !fir.heap> { %1 = fir.allocmem !fir.array<3x?xi32>, %e return %1 : !fir.heap> @@ -190,7 +198,9 @@ func.func @allocmem_dynarray_of_nonchar(%e: index) -> !fir.heap !fir.heap> { %1 = fir.allocmem !fir.array, %e, %e return %1 : !fir.heap> @@ -218,7 +228,9 @@ func.func @alloca_dynarray_of_char2(%e : index) -> !fir.ref !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x!fir.char<2,10>>, %e return %1 : !fir.heap>> @@ -228,7 +240,9 @@ func.func @allocmem_dynarray_of_char(%e : index) -> !fir.heap !fir.heap>> { %1 = fir.allocmem !fir.array>, %e, %e return %1 : !fir.heap>> @@ -261,7 +275,9 @@ func.func @alloca_dynarray_of_dynchar2(%l: i32, %e : index) -> !fir.ref !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x!fir.char<2,?>>(%l : i32), %e return %1 : !fir.heap>> @@ -273,7 +289,9 @@ func.func @allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> !fir.heap !fir.heap>> { %1 = fir.allocmem !fir.array>(%l : i32), %e, %e return %1 : !fir.heap>> @@ -312,7 +330,9 @@ func.func @alloca_array_with_holes_dynchar(%arg0: index, %arg1: index) -> !fir.r // CHECK-SAME: i64 %[[e1:.*]], i64 %[[e2:.*]]) // CHECK: %[[a:.*]] = mul i64 240, %[[e1]] // CHECK: %[[b:.*]] = mul i64 %3, %[[e2]] -// CHECK: call ptr @malloc(i64 %[[b]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[b]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[b]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_array_with_holes_nonchar(%0 : index, %1 : index) -> !fir.heap> { %a = fir.allocmem !fir.array<4x?x3x?x5xi32>, %0, %1 return %a : !fir.heap> @@ -321,7 +341,9 @@ func.func @allocmem_array_with_holes_nonchar(%0 : index, %1 : index) -> !fir.hea // CHECK-LABEL: define ptr @allocmem_array_with_holes_char( // CHECK-SAME: i64 %[[e:.*]]) // CHECK: %[[mul:.*]] = mul i64 240, %[[e]] -// CHECK: call ptr @malloc(i64 %[[mul]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_array_with_holes_char(%e: index) -> !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x4x!fir.char<2,10>>, %e return %1 : !fir.heap>> @@ -331,7 +353,9 @@ func.func @allocmem_array_with_holes_char(%e: index) -> !fir.heap !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x4x!fir.char<2,?>>(%arg0 : index), %arg1 return %1 : !fir.heap>> diff --git 
a/flang/test/Fir/arrexp.fir b/flang/test/Fir/arrexp.fir index 6c7f71f6f1f9c..e8ec8ac79e0c2 100644 --- a/flang/test/Fir/arrexp.fir +++ b/flang/test/Fir/arrexp.fir @@ -146,7 +146,9 @@ func.func @f6(%arg0: !fir.box>, %arg1: f32) { // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i64 0, i32 1 // CHECK: %[[EXTENT:.*]] = load i64, ptr %[[EXT_GEP]] // CHECK: %[[SIZE:.*]] = mul i64 4, %[[EXTENT]] - // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 %[[SIZE]]) + // CHECK: %[[CMP:.*]] = icmp sgt i64 %[[SIZE]], 0 + // CHECK: %[[SZ:.*]] = select i1 %[[CMP]], i64 %[[SIZE]], i64 1 + // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 %[[SZ]]) %1 = fir.slice %c2, %c10, %c1 : (index, index, index) -> !fir.slice<1> %2 = fir.array_load %arg0 [%1] : (!fir.box>, !fir.slice<1>) -> !fir.array %3 = fir.slice %c1, %c9, %c1 : (index, index, index) -> !fir.slice<1> diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 0e2bfe48a807d..50a98466f0d4b 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -216,10 +216,14 @@ func.func @test_alloc_and_freemem_one() { } // CHECK-LABEL: llvm.func @test_alloc_and_freemem_one() { -// CHECK: %[[N:.*]] = llvm.mlir.constant(4 : i64) : i64 -// CHECK-NEXT: llvm.call @malloc(%[[N]]) -// CHECK: llvm.call @free(%{{.*}}) -// CHECK-NEXT: llvm.return +// CHECK-DAG: %[[N:.*]] = llvm.mlir.constant(4 : i64) : i64 +// CHECK-DAG: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK-DAG: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK-NEXT: %[[CMP:.*]] = llvm.icmp "sgt" %[[N]], %[[ZERO]] : i64 +// CHECK-NEXT: %[[SZ:.*]] = llvm.select %[[CMP]], %[[N]], %[[ONE]] : i1, i64 +// CHECK-NEXT: llvm.call @malloc(%[[SZ]]) +// CHECK: llvm.call @free(%{{.*}}) +// CHECK-NEXT: llvm.return // ----- // Verify that fir.allocmem is transformed to a call to malloc @@ -233,8 +237,12 @@ func.func @test_alloc_and_freemem_several() { } // CHECK-LABEL: llvm.func @test_alloc_and_freemem_several() { -// CHECK: %[[N:.*]] = llvm.mlir.constant(400 : i64) : i64 -// CHECK: [[MALLOC:%.*]] = llvm.call @malloc(%[[N]]) +// CHECK-DAG: %[[N:.*]] = llvm.mlir.constant(400 : i64) : i64 +// CHECK-DAG: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK-DAG: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK-NEXT: %[[CMP:.*]] = llvm.icmp "sgt" %[[N]], %[[ZERO]] : i64 +// CHECK-NEXT: %[[SZ:.*]] = llvm.select %[[CMP]], %[[N]], %[[ONE]] : i1, i64 +// CHECK: [[MALLOC:%.*]] = llvm.call @malloc(%[[SZ]]) // CHECK: llvm.call @free([[MALLOC]]) // CHECK: llvm.return @@ -250,7 +258,11 @@ func.func @test_with_shape(%ncols: index, %nrows: index) { // CHECK: %[[FOUR:.*]] = llvm.mlir.constant(4 : i64) : i64 // CHECK: %[[DIM1_SIZE:.*]] = llvm.mul %[[FOUR]], %[[NCOLS]] : i64 // CHECK: %[[TOTAL_SIZE:.*]] = llvm.mul %[[DIM1_SIZE]], %[[NROWS]] : i64 -// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[TOTAL_SIZE]]) +// CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[CMP:.*]] = llvm.icmp "sgt" %[[TOTAL_SIZE]], %[[ZERO]] : i64 +// CHECK: %[[SZ:.*]] = llvm.select %[[CMP]], %[[TOTAL_SIZE]], %[[ONE]] : i1, i64 +// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[SZ]]) // CHECK: llvm.call @free(%[[MEM]]) : (!llvm.ptr) -> () // CHECK: llvm.return // CHECK: } @@ -266,7 +278,11 @@ func.func @test_string_with_shape(%len: index, %nelems: index) { // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[LEN_SIZE:.*]] = llvm.mul %[[ONE]], %[[LEN]] : i64 // CHECK: 
%[[TOTAL_SIZE:.*]] = llvm.mul %[[LEN_SIZE]], %[[NELEMS]] : i64 -// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[TOTAL_SIZE]]) +// CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[ONEA:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[CMP:.*]] = llvm.icmp "sgt" %[[TOTAL_SIZE]], %[[ZERO]] : i64 +// CHECK: %[[SZ:.*]] = llvm.select %[[CMP]], %[[TOTAL_SIZE]], %[[ONEA]] : i1, i64 +// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[SZ]]) // CHECK: llvm.call @free(%[[MEM]]) : (!llvm.ptr) -> () // CHECK: llvm.return // CHECK: } diff --git a/flang/test/Integration/debug-common-block-1.f90 b/flang/test/Integration/debug-common-block-1.f90 index 18217637be0fa..77f47daea4a91 100644 --- a/flang/test/Integration/debug-common-block-1.f90 +++ b/flang/test/Integration/debug-common-block-1.f90 @@ -89,7 +89,7 @@ program test ! CHECK-DAG: ![[CBF3]] = !DICommonBlock(scope: ![[F3]], declaration: null, name: "__BLNK__"{{.*}}) ! CHECK-DAG: ![[CBAF3]] = !DICommonBlock(scope: ![[F3]], declaration: null, name: "a"{{.*}}) -! CHECK-DAG: ![[MAIN:[0-9]+]] = {{.*}}!DISubprogram(name: "test"{{.*}}) +! CHECK-DAG: ![[MAIN:[0-9]+]] = {{.*}}!DISubprogram(name: "TEST"{{.*}}) ! CHECK-DAG: ![[CBM]] = !DICommonBlock(scope: ![[MAIN]], declaration: null, name: "__BLNK__"{{.*}}) ! CHECK-DAG: ![[CBAM]] = !DICommonBlock(scope: ![[MAIN]], declaration: null, name: "a"{{.*}}) diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90 index b97be141cc8d0..0ddac633a5b1e 100644 --- a/flang/test/Integration/debug-local-var-2.f90 +++ b/flang/test/Integration/debug-local-var-2.f90 @@ -37,7 +37,7 @@ ! BOTH-LABEL: } program mn -! BOTH-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "mn", {{.*}}) +! BOTH-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "MN", {{.*}}) ! BOTH-DAG: ![[TYI32:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) ! BOTH-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer", size: 64, encoding: DW_ATE_signed) diff --git a/flang/test/Lower/CUDA/cuda-derived.cuf b/flang/test/Lower/CUDA/cuda-derived.cuf index 96250d88d81c4..d419ee074f7a0 100644 --- a/flang/test/Lower/CUDA/cuda-derived.cuf +++ b/flang/test/Lower/CUDA/cuda-derived.cuf @@ -25,6 +25,6 @@ program main type(t2) :: b end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} ! CHECK: %{{.*}} = cuf.alloc !fir.type<_QMm1Tty_device{x:!fir.box>>}> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFEa"} ! CHECK: %{{.*}} = cuf.alloc !fir.type<_QMm1Tt2{b:!fir.type<_QMm1Tt1{a:!fir.box>>}>}> {bindc_name = "b", data_attr = #cuf.cuda, uniq_name = "_QFEb"} diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 42ee7657966e2..d5e614a83b354 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -10,6 +10,7 @@ attributes(global) subroutine devsub() integer(4) :: ai integer(8) :: al integer(8) :: time + integer :: smalltime call syncthreads() call syncwarp(1) @@ -45,7 +46,9 @@ attributes(global) subroutine devsub() ai = atomicinc(ai, 1_4) ai = atomicdec(ai, 1_4) + smalltime = clock() time = clock64() + time = globalTimer() end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} @@ -82,7 +85,9 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw uinc_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = llvm.atomicrmw udec_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 -! 
CHECK: fir.call @llvm.nvvm.read.ptx.sreg.clock64() +! CHECK: %{{.*}} = nvvm.read.ptx.sreg.clock : i32 +! CHECK: %{{.*}} = nvvm.read.ptx.sreg.clock64 : i64 +! CHECK: %{{.*}} = nvvm.read.ptx.sreg.globaltimer : i64 subroutine host1() integer, device :: a(32) diff --git a/flang/test/Lower/CUDA/cuda-return01.cuf b/flang/test/Lower/CUDA/cuda-return01.cuf index 47e69a903efd3..ed7c640a71082 100644 --- a/flang/test/Lower/CUDA/cuda-return01.cuf +++ b/flang/test/Lower/CUDA/cuda-return01.cuf @@ -28,6 +28,6 @@ program main return end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} ! CHECK: cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFEa"} -> !fir.ref>>> ! CHECK-NOT: cuf.free diff --git a/flang/test/Lower/CUDA/cuda-return02.cuf b/flang/test/Lower/CUDA/cuda-return02.cuf index e450d7e796f22..e54818444e49c 100644 --- a/flang/test/Lower/CUDA/cuda-return02.cuf +++ b/flang/test/Lower/CUDA/cuda-return02.cuf @@ -13,7 +13,7 @@ program test return end -! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} { ! CHECK: %[[DECL:.*]]:2 = hlfir.declare ! CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 ! CHECK-NEXT: ^bb1: diff --git a/flang/test/Lower/CUDA/cuda-set-allocator.cuf b/flang/test/Lower/CUDA/cuda-set-allocator.cuf index ee89ea38a3fc7..e3bb181f65398 100644 --- a/flang/test/Lower/CUDA/cuda-set-allocator.cuf +++ b/flang/test/Lower/CUDA/cuda-set-allocator.cuf @@ -21,4 +21,36 @@ contains ! CHECK: %[[Z:.*]] = fir.coordinate_of %[[DT]]#0, z : (!fir.ref>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> ! CHECK: cuf.set_allocator_idx %[[Z]] : !fir.ref>>> {data_attr = #cuf.cuda} + subroutine sub2() + type(ty_device), pointer :: d1 + end subroutine + +! CHECK-LABEL: func.func @_QMm1Psub2() +! CHECK: %[[ALLOC:.*]] = cuf.alloc !fir.box>>,y:i32,z:!fir.box>>}>>> {bindc_name = "d1", data_attr = #cuf.cuda, uniq_name = "_QMm1Fsub2Ed1"} -> !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMm1Fsub2Ed1"} : (!fir.ref>>,y:i32,z:!fir.box>>}>>>>) -> (!fir.ref>>,y:i32,z:!fir.box>>}>>>>, !fir.ref>>,y:i32,z:!fir.box>>}>>>>) +! CHECK: %[[LOAD1:.*]] = fir.load %[[DECL]]#0 : !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[ADDR1:.*]] = fir.box_addr %[[LOAD1]] : (!fir.box>>,y:i32,z:!fir.box>>}>>>) -> !fir.ptr>>,y:i32,z:!fir.box>>}>> +! CHECK: %[[DESIGNATE1:.*]] = hlfir.designate %[[ADDR1]]{"x"} : (!fir.ptr>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! CHECK: cuf.set_allocator_idx %[[DESIGNATE1]] : !fir.ref>>> {data_attr = #cuf.cuda} +! CHECK: %[[LOAD2:.*]] = fir.load %[[DECL]]#0 : !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[ADDR2:.*]] = fir.box_addr %[[LOAD2]] : (!fir.box>>,y:i32,z:!fir.box>>}>>>) -> !fir.ptr>>,y:i32,z:!fir.box>>}>> +! CHECK: %[[DESIGNATE2:.*]] = hlfir.designate %[[ADDR2]]{"z"} : (!fir.ptr>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! CHECK: cuf.set_allocator_idx %[[DESIGNATE2]] : !fir.ref>>> {data_attr = #cuf.cuda} + + subroutine sub3() + type(ty_device), allocatable :: d1 + end subroutine + +! CHECK-LABEL: func.func @_QMm1Psub3() +! CHECK: %[[ALLOC:.*]] = cuf.alloc !fir.box>>,y:i32,z:!fir.box>>}>>> {bindc_name = "d1", data_attr = #cuf.cuda, uniq_name = "_QMm1Fsub3Ed1"} -> !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! 
CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMm1Fsub3Ed1"} : (!fir.ref>>,y:i32,z:!fir.box>>}>>>>) -> (!fir.ref>>,y:i32,z:!fir.box>>}>>>>, !fir.ref>>,y:i32,z:!fir.box>>}>>>>) +! CHECK: %[[LOAD1:.*]] = fir.load %[[DECL]]#0 : !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[ADDR1:.*]] = fir.box_addr %[[LOAD1]] : (!fir.box>>,y:i32,z:!fir.box>>}>>>) -> !fir.heap>>,y:i32,z:!fir.box>>}>> +! CHECK: %[[DESIGNATE1:.*]] = hlfir.designate %[[ADDR1]]{"x"} : (!fir.heap>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! CHECK: cuf.set_allocator_idx %[[DESIGNATE1]] : !fir.ref>>> {data_attr = #cuf.cuda} +! CHECK: %[[LOAD2:.*]] = fir.load %[[DECL]]#0 : !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[ADDR2:.*]] = fir.box_addr %[[LOAD2]] : (!fir.box>>,y:i32,z:!fir.box>>}>>>) -> !fir.heap>>,y:i32,z:!fir.box>>}>> +! CHECK: %[[DESIGNATE2:.*]] = hlfir.designate %[[ADDR2]]{"z"} : (!fir.heap>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! CHECK: cuf.set_allocator_idx %[[DESIGNATE2]] : !fir.ref>>> {data_attr = #cuf.cuda} + end module diff --git a/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 b/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 index 07c4f012781d4..cbc56ca1e395b 100644 --- a/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 +++ b/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 @@ -24,7 +24,7 @@ program main call mvbits(from, 2, 2, to, 0) if (any(to /= 5)) STOP 1 end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<3xi32> {bindc_name = "from", uniq_name = "_QFEfrom"} ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 b/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 index 7b64634d10d4b..a097b1522307e 100644 --- a/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 +++ b/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 @@ -35,7 +35,7 @@ FUNCTION BAR() RESULT(res) END END -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref) -> i32>}> ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref) -> i32>}> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.boxproc<(!fir.ref) -> i32> {bindc_name = "pp2", uniq_name = "_QFEpp2"} diff --git a/flang/test/Lower/Intrinsics/cospi.f90 b/flang/test/Lower/Intrinsics/cospi.f90 new file mode 100644 index 0000000000000..894002566141b --- /dev/null +++ b/flang/test/Lower/Intrinsics/cospi.f90 @@ -0,0 +1,22 @@ +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefixes="CHECK" + +function test_real4(x) + real :: x, test_real4 + test_real4 = cospi(x) +end function + +! CHECK-LABEL: @_QPtest_real4 +! CHECK: %[[dfactor:.*]] = arith.constant 3.1415926535897931 : f64 +! CHECK: %[[factor:.*]] = fir.convert %[[dfactor]] : (f64) -> f32 +! CHECK: %[[mul:.*]] = arith.mulf %{{.*}}, %[[factor]] fastmath : f32 +! CHECK: %[[cos:.*]] = math.cos %[[mul]] fastmath : f32 + +function test_real8(x) + real(8) :: x, test_real8 + test_real8 = cospi(x) +end function + +! CHECK-LABEL: @_QPtest_real8 +! 
CHECK: %[[dfactor:.*]] = arith.constant 3.1415926535897931 : f64 +! CHECK: %[[mul:.*]] = arith.mulf %{{.*}}, %[[dfactor]] fastmath : f64 +! CHECK: %[[cos:.*]] = math.cos %[[mul]] fastmath : f64 diff --git a/flang/test/Lower/Intrinsics/sinpi.f90 b/flang/test/Lower/Intrinsics/sinpi.f90 new file mode 100644 index 0000000000000..38c2277892ec7 --- /dev/null +++ b/flang/test/Lower/Intrinsics/sinpi.f90 @@ -0,0 +1,22 @@ +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefixes="CHECK" + +function test_real4(x) + real :: x, test_real4 + test_real4 = sinpi(x) +end function + +! CHECK-LABEL: @_QPtest_real4 +! CHECK: %[[dfactor:.*]] = arith.constant 3.1415926535897931 : f64 +! CHECK: %[[factor:.*]] = fir.convert %[[dfactor]] : (f64) -> f32 +! CHECK: %[[mul:.*]] = arith.mulf %{{.*}}, %[[factor]] fastmath : f32 +! CHECK: %[[sin:.*]] = math.sin %[[mul]] fastmath : f32 + +function test_real8(x) + real(8) :: x, test_real8 + test_real8 = sinpi(x) +end function + +! CHECK-LABEL: @_QPtest_real8 +! CHECK: %[[dfactor:.*]] = arith.constant 3.1415926535897931 : f64 +! CHECK: %[[mul:.*]] = arith.mulf %{{.*}}, %[[dfactor]] fastmath : f64 +! CHECK: %[[sin:.*]] = math.sin %[[mul]] fastmath : f64 diff --git a/flang/test/Lower/OpenACC/acc-atomic-read.f90 b/flang/test/Lower/OpenACC/acc-atomic-read.f90 index 639a98051e3a2..76751a0fa63a8 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-read.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-read.f90 @@ -8,7 +8,7 @@ program acc_atomic_test g = h end program acc_atomic_test -! CHECK: func @_QQmain() attributes {fir.bindc_name = "acc_atomic_test"} { +! CHECK: func @_QQmain() attributes {fir.bindc_name = "ACC_ATOMIC_TEST"} { ! CHECK: %[[VAR_G:.*]] = fir.alloca f32 {bindc_name = "g", uniq_name = "_QFEg"} ! CHECK: %[[G_DECL:.*]]:2 = hlfir.declare %[[VAR_G]] {uniq_name = "_QFEg"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAR_H:.*]] = fir.alloca f32 {bindc_name = "h", uniq_name = "_QFEh"} diff --git a/flang/test/Lower/OpenACC/acc-atomic-write.f90 b/flang/test/Lower/OpenACC/acc-atomic-write.f90 index 3c55394021abf..e0116e3281820 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-write.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-write.f90 @@ -2,7 +2,7 @@ ! This test checks the lowering of atomic write -!CHECK: func @_QQmain() attributes {fir.bindc_name = "acc_atomic_write_test"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "ACC_ATOMIC_WRITE_TEST"} { !CHECK: %[[VAR_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[VAR_X]] {uniq_name = "_QFEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[VAR_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"} diff --git a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 index 164eb32a8f684..2de7cc5761a2b 100644 --- a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 +++ b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 @@ -15,15 +15,17 @@ subroutine acc_host_data() !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} +! 
CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} + ! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) !$acc host_data use_device(a) if_present !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) { +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>{{.*}}) { ! CHECK: } attributes {ifPresent} !$acc host_data use_device(a) if(ifCondition) @@ -33,14 +35,14 @@ subroutine acc_host_data() ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref> ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1 -! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.true.) !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.false.) a = 1.0 diff --git a/flang/test/Lower/OpenACC/acc-host-data.f90 b/flang/test/Lower/OpenACC/acc-host-data.f90 index 871eabd256ca6..4d09b25b983b9 100644 --- a/flang/test/Lower/OpenACC/acc-host-data.f90 +++ b/flang/test/Lower/OpenACC/acc-host-data.f90 @@ -14,34 +14,37 @@ subroutine acc_host_data() !$acc host_data use_device(a) !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) !$acc host_data use_device(a) if_present !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) { +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) ! CHECK: } attributes {ifPresent} - !$acc host_data use_device(a) if_present if_present + !$acc host_data use_device(a) if_present !$acc end host_data -! CHECK: acc.host_data dataOperands(%{{.*}} : !fir.ref>) { +! CHECK: acc.host_data dataOperands(%{{.*}}{{.*}} : !fir.ref>{{.*}}) { ! 
CHECK: } attributes {ifPresent} !$acc host_data use_device(a) if(ifCondition) !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) -> !fir.ref> {name = "a"} ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref> ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1 -! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA0]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.true.) !$acc end host_data ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.false.) a = 1.0 diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 index c6df28ec5e000..f9f5e8c2165d5 100644 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -372,12 +372,15 @@ subroutine sub1(i, j, k) end subroutine ! CHECK: func.func @_QPsub1 -! CHECK: acc.parallel -! CHECK: %[[DC_K:.*]] = fir.alloca i32 {bindc_name = "k"} -! CHECK: %[[DC_J:.*]] = fir.alloca i32 {bindc_name = "j"} -! CHECK: %[[DC_I:.*]] = fir.alloca i32 {bindc_name = "i"} -! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]] : !fir.ref) -> !fir.ref {implicit = true, name = "i"} -! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]] : !fir.ref) -> !fir.ref {implicit = true, name = "j"} -! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]] : !fir.ref) -> !fir.ref {implicit = true, name = "k"} +! CHECK-SAME: %[[ARG_I:.*]]: !fir.ref {fir.bindc_name = "i"} +! CHECK-SAME: %[[ARG_J:.*]]: !fir.ref {fir.bindc_name = "j"} +! CHECK-SAME: %[[ARG_K:.*]]: !fir.ref {fir.bindc_name = "k"} +! CHECK: %[[DC_I:.*]]:2 = hlfir.declare %[[ARG_I]] dummy_scope %0 +! CHECK: %[[DC_J:.*]]:2 = hlfir.declare %[[ARG_J]] dummy_scope %0 +! CHECK: %[[DC_K:.*]]:2 = hlfir.declare %[[ARG_K]] dummy_scope %0 +! CHECK: acc.parallel combined(loop) +! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "i"} +! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "j"} +! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "k"} ! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[P_I]] : !fir.ref, @privatization_ref_i32 -> %[[P_J]] : !fir.ref, @privatization_ref_i32 -> %[[P_K]] : !fir.ref) control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%c10{{.*}}, %c100{{.*}}, %c200{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) ! CHECK: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} diff --git a/flang/test/Lower/OpenACC/acc-routine.f90 b/flang/test/Lower/OpenACC/acc-routine.f90 index 789f3a57e1f79..1a63b4120235c 100644 --- a/flang/test/Lower/OpenACC/acc-routine.f90 +++ b/flang/test/Lower/OpenACC/acc-routine.f90 @@ -2,13 +2,14 @@ ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s -! 
CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind("_QPacc_routine17" [#acc.device_type], "_QPacc_routine17" [#acc.device_type], "_QPacc_routine16" [#acc.device_type]) -! CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind("_QPacc_routine17" [#acc.device_type], "_QPacc_routine16" [#acc.device_type]) +! CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind(@_QPacc_routine17 [#acc.device_type], @_QPacc_routine17 +! [#acc.device_type], @_QPacc_routine16 [#acc.device_type]) +! CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind(@_QPacc_routine17 [#acc.device_type], @_QPacc_routine16 [#acc.device_type]) ! CHECK: acc.routine @[[r12:.*]] func(@_QPacc_routine17) worker ([#acc.device_type]) vector ([#acc.device_type]) ! CHECK: acc.routine @[[r11:.*]] func(@_QPacc_routine16) gang([#acc.device_type]) seq ([#acc.device_type]) ! CHECK: acc.routine @[[r10:.*]] func(@_QPacc_routine11) seq ! CHECK: acc.routine @[[r09:.*]] func(@_QPacc_routine10) seq -! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind("_QPacc_routine9a") +! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind(@_QPacc_routine9a) ! CHECK: acc.routine @[[r07:.*]] func(@_QPacc_routine8) bind("routine8_") ! CHECK: acc.routine @[[r06:.*]] func(@_QPacc_routine7) gang(dim: 1 : i64) ! CHECK: acc.routine @[[r05:.*]] func(@_QPacc_routine6) nohost diff --git a/flang/test/Lower/OpenACC/acc-routine03.f90 b/flang/test/Lower/OpenACC/acc-routine03.f90 index 85e4ef580f983..ddd6bda0367e4 100644 --- a/flang/test/Lower/OpenACC/acc-routine03.f90 +++ b/flang/test/Lower/OpenACC/acc-routine03.f90 @@ -30,6 +30,6 @@ subroutine sub2(a) end subroutine ! CHECK: acc.routine @acc_routine_1 func(@_QPsub2) worker nohost -! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind("_QPsub2") worker +! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind(@_QPsub2) worker ! CHECK: func.func @_QPsub1(%arg0: !fir.box> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} ! CHECK: func.func @_QPsub2(%arg0: !fir.box> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>} diff --git a/flang/test/Lower/OpenACC/acc-routine04.f90 b/flang/test/Lower/OpenACC/acc-routine04.f90 index f603376163901..655e2762b9694 100644 --- a/flang/test/Lower/OpenACC/acc-routine04.f90 +++ b/flang/test/Lower/OpenACC/acc-routine04.f90 @@ -30,5 +30,5 @@ subroutine sub2() ! CHECK: acc.routine @acc_routine_1 func(@_QFPsub2) seq ! CHECK: acc.routine @acc_routine_0 func(@_QMdummy_modPsub1) seq ! CHECK: func.func @_QMdummy_modPsub1(%arg0: !fir.ref {fir.bindc_name = "i"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} -! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test_acc_routine"} +! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "TEST_ACC_ROUTINE"} ! CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage} diff --git a/flang/test/Lower/OpenACC/acc-use-device.f90 b/flang/test/Lower/OpenACC/acc-use-device.f90 new file mode 100644 index 0000000000000..081a6e317bfc9 --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-use-device.f90 @@ -0,0 +1,61 @@ +! This test checks whether the OpenACC use_device clause is applied on both results of hlfir.declare. + +! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s + +! Test for automatic variable appearing in use_device clause. +subroutine test() + integer :: N = 100 + real*8 :: b(-1:N) +! 
CHECK: %[[A0:.*]] = fir.alloca !fir.array, %{{.*}} {bindc_name = "b", uniq_name = "_QFtestEb"} +! CHECK: %[[A1:.*]] = fir.shape_shift {{.*}} : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[A0]](%[[A1]]) {uniq_name = "_QFtestEb"} : (!fir.ref>, !fir.shapeshift<1>) -> (!fir.box>, !fir.ref>) + + !$acc data copy(b) +! CHECK: %[[B:.*]] = acc.copyin var(%[[A]]#0 : !fir.box>) -> !fir.box> {dataClause = #acc, name = "b"} +! CHECK: acc.data dataOperands(%[[B]] : !fir.box>) { + + !$acc host_data use_device(b) + call vadd(b) + !$acc end host_data +! CHECK: %[[C:.*]] = acc.use_device var(%[[A]]#0 : !fir.box>) -> !fir.box> {name = "b"} +! CHECK: %[[D:.*]] = acc.use_device varPtr(%[[A]]#1 : !fir.ref>) -> !fir.ref> {name = "b"} +! CHECK: acc.host_data dataOperands(%[[C]], %[[D]] : !fir.box>, !fir.ref>) { +! CHECK: fir.call @_QPvadd(%[[A]]#1) fastmath : (!fir.ref>) -> () + !$acc end data +! CHECK: acc.copyout accVar(%[[B]] : !fir.box>) to var(%[[A]]#0 : !fir.box>) {dataClause = #acc, name = "b"} +end + +! Test for allocatable, pointer and assumed-shape variables appearing in use_device clause. +subroutine test2(a, b, c) + integer :: N = 100 + real*8, allocatable :: a(:) + real*8, target, allocatable :: d(:) + real*8 :: b(:) + real*8, pointer :: c(:) + call allocate(a(N)) + call allocate(d(N)) + c => d +! CHECK: %[[DS:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[E:.*]]:2 = hlfir.declare %arg0 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest2Ea"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[F:.*]]:2 = hlfir.declare %arg1 dummy_scope %[[DS]] {uniq_name = "_QFtest2Eb"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +! CHECK: %[[G:.*]]:2 = hlfir.declare %arg2 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest2Ec"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) + + !$acc data copy(a,b,c,d) + !$acc host_data use_device(a,b,c) + call vadd2(a,b,c) + !$acc end host_data + +! CHECK: %[[H:.*]] = acc.use_device varPtr(%[[E]]#0 : !fir.ref>>>) -> !fir.ref>>> {name = "a"} +! CHECK: %[[I:.*]] = acc.use_device varPtr(%[[E]]#1 : !fir.ref>>>) -> !fir.ref>>> {name = "a"} +! CHECK: %[[J:.*]] = acc.use_device var(%[[F]]#0 : !fir.box>) -> !fir.box> {name = "b"} +! CHECK: %[[K:.*]] = acc.use_device var(%[[F]]#1 : !fir.box>) -> !fir.box> {name = "b"} +! CHECK: %[[L:.*]] = acc.use_device varPtr(%[[G]]#0 : !fir.ref>>>) -> !fir.ref>>> {name = "c"} +! CHECK: %[[M:.*]] = acc.use_device varPtr(%[[G]]#1 : !fir.ref>>>) -> !fir.ref>>> {name = "c"} +! CHECK: acc.host_data dataOperands(%[[H]], %[[I]], %[[J]], %[[K]], %[[L]], %[[M]] : !fir.ref>>>, !fir.ref>>>, !fir.box>, !fir.box>, !fir.ref>>>, !fir.ref>>>) { + + + + + !$acc end data + +end diff --git a/flang/test/Lower/OpenMP/atomic-read.f90 b/flang/test/Lower/OpenMP/atomic-read.f90 index 68dcaac90eef5..30313e240efa3 100644 --- a/flang/test/Lower/OpenMP/atomic-read.f90 +++ b/flang/test/Lower/OpenMP/atomic-read.f90 @@ -4,7 +4,7 @@ ! 
This test checks the lowering of atomic read -!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomic"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "OMPATOMIC"} { !CHECK: %[[A_REF:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_REF]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[B_REF:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"} diff --git a/flang/test/Lower/OpenMP/atomic-write.f90 b/flang/test/Lower/OpenMP/atomic-write.f90 index 6eded49b0b15d..742fd475c0f04 100644 --- a/flang/test/Lower/OpenMP/atomic-write.f90 +++ b/flang/test/Lower/OpenMP/atomic-write.f90 @@ -4,7 +4,7 @@ ! This test checks the lowering of atomic write -!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomicwrite"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "OMPATOMICWRITE"} { !CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[Y_REF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"} diff --git a/flang/test/Lower/OpenMP/common-atomic-lowering.f90 b/flang/test/Lower/OpenMP/common-atomic-lowering.f90 index a53cc101024c6..f729bbb00ac8e 100644 --- a/flang/test/Lower/OpenMP/common-atomic-lowering.f90 +++ b/flang/test/Lower/OpenMP/common-atomic-lowering.f90 @@ -1,6 +1,6 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "sample"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "SAMPLE"} { !CHECK: %[[val_0:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[val_1:.*]]:2 = hlfir.declare %[[val_0]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[val_2:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"} diff --git a/flang/test/Lower/OpenMP/cray-pointers02.f90 b/flang/test/Lower/OpenMP/cray-pointers02.f90 index 19e4cd09fe50a..79d838702e4b0 100644 --- a/flang/test/Lower/OpenMP/cray-pointers02.f90 +++ b/flang/test/Lower/OpenMP/cray-pointers02.f90 @@ -1,7 +1,7 @@ ! Test lowering of Cray pointee references. ! RUN: flang -fc1 -emit-hlfir -fopenmp %s -o - 2>&1 | FileCheck %s -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test_cray_pointers_02"} +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST_CRAY_POINTERS_02"} program test_cray_pointers_02 implicit none diff --git a/flang/test/Lower/OpenMP/default-clause-byref.f90 b/flang/test/Lower/OpenMP/default-clause-byref.f90 index c44c6bb966580..af51c4cc3e814 100644 --- a/flang/test/Lower/OpenMP/default-clause-byref.f90 +++ b/flang/test/Lower/OpenMP/default-clause-byref.f90 @@ -34,7 +34,7 @@ !CHECK: omp.yield(%[[PRIV_X]] : !fir.ref) !CHECK: } -!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "DEFAULT_CLAUSE_LOWERING"} { !CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"} !CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFEw"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} diff --git a/flang/test/Lower/OpenMP/default-clause.f90 b/flang/test/Lower/OpenMP/default-clause.f90 index ee5f579f06b91..505fa4f0f5d63 100644 --- a/flang/test/Lower/OpenMP/default-clause.f90 +++ b/flang/test/Lower/OpenMP/default-clause.f90 @@ -8,7 +8,7 @@ ! 
RUN: | FileCheck %s -!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "DEFAULT_CLAUSE_LOWERING"} { !CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"} !CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFEw"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} diff --git a/flang/test/Lower/OpenMP/map-modifiers.f90 b/flang/test/Lower/OpenMP/map-modifiers.f90 index 64d7869cbb836..be93c14627f9a 100644 --- a/flang/test/Lower/OpenMP/map-modifiers.f90 +++ b/flang/test/Lower/OpenMP/map-modifiers.f90 @@ -1,4 +1,6 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s subroutine map_present_target_data integer :: x @@ -15,6 +17,14 @@ subroutine map_present_update !$omp target update to(present: x) end subroutine +subroutine map_always + integer :: x +!CHECK: %[[MAP:.*]] = omp.map.info {{.*}} map_clauses(always, tofrom) {{.*}} {name = "x"} +!CHECK: omp.target_data map_entries(%[[MAP]] : {{.*}}) { +!$omp target data map(always, tofrom: x) +!$omp end target data +end subroutine + subroutine map_close integer :: x !CHECK: %[[MAP:.*]] = omp.map.info {{.*}} map_clauses(close, tofrom) {{.*}} {name = "x"} diff --git a/flang/test/Lower/OpenMP/nested-loop-transformation-construct01.f90 b/flang/test/Lower/OpenMP/nested-loop-transformation-construct01.f90 index a76e7e52100db..17eba93a7405d 100644 --- a/flang/test/Lower/OpenMP/nested-loop-transformation-construct01.f90 +++ b/flang/test/Lower/OpenMP/nested-loop-transformation-construct01.f90 @@ -1,6 +1,6 @@ ! Test to ensure TODO message is emitted for tile OpenMP 5.1 Directives when they are nested. -!RUN: not %flang -fopenmp -fopenmp-version=51 %s 2>&1 | FileCheck %s +!RUN: not %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s subroutine loop_transformation_construct implicit none diff --git a/flang/test/Lower/OpenMP/nested-loop-transformation-construct02.f90 b/flang/test/Lower/OpenMP/nested-loop-transformation-construct02.f90 index 33b7c5a917619..cdc628a3b2e64 100644 --- a/flang/test/Lower/OpenMP/nested-loop-transformation-construct02.f90 +++ b/flang/test/Lower/OpenMP/nested-loop-transformation-construct02.f90 @@ -1,6 +1,6 @@ ! Test to ensure TODO message is emitted for unroll OpenMP 5.1 Directives when they are nested. -!RUN: not %flang -fopenmp -fopenmp-version=51 %s 2>&1 | FileCheck %s +!RUN: not %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s program loop_transformation_construct implicit none @@ -17,4 +17,4 @@ program loop_transformation_construct !$omp end do end program loop_transformation_construct -!CHECK: not yet implemented: Unhandled loop directive (unroll) +!CHECK: not yet implemented: Applying a loop-associated on the loop generated by the unroll construct diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 index 4bfd5d8d19261..0036670317a8e 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 @@ -80,7 +80,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! 
CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 index ec54294c7104f..ea0aa9ec3f53b 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 @@ -68,7 +68,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 2 : index diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 index 488ecc353af8e..eb0df2b3a17de 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 @@ -63,7 +63,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 index 596276a99cafc..2caec0384a6ab 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 @@ -18,7 +18,7 @@ !CHECK: fir.store %[[CR]] to %[[C0]] : !fir.ref !CHECK: omp.yield(%[[C0]] : !fir.ref) !CHECK: } -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "mn"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MN"} { !CHECK: %[[RED_ACCUM_REF:[_a-z0-9]+]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} !CHECK: %[[RED_ACCUM_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[RED_ACCUM_REF]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[C0:[_a-z0-9]+]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 index f638688bc2cc1..3c1daa0eb983f 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 @@ -82,7 +82,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 index c06343e997bfd..2be154f4bbaf5 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 @@ -25,7 +25,7 @@ end program main ! CHECK: omp.yield(%[[VAL_2]] : i32) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFEn"} ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/parallel-reduction.f90 b/flang/test/Lower/OpenMP/parallel-reduction.f90 index 612549fb32de5..15e8cc325916d 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction.f90 @@ -10,7 +10,7 @@ !CHECK: %[[CR:[_a-z0-9]+]] = arith.addi %[[C0]], %[[C1]] : i32 !CHECK: omp.yield(%[[CR]] : i32) !CHECK: } -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "mn"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MN"} { !CHECK: %[[RED_ACCUM_REF:[_a-z0-9]+]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} !CHECK: %[[RED_ACCUM_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[RED_ACCUM_REF]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[C0:[_a-z0-9]+]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/sections.f90 b/flang/test/Lower/OpenMP/sections.f90 index d11925cafdc12..3d5c0326fb6b9 100644 --- a/flang/test/Lower/OpenMP/sections.f90 +++ b/flang/test/Lower/OpenMP/sections.f90 @@ -5,7 +5,7 @@ ! RUN: %flang_fc1 -emit-hlfir %openmp_flags %s -o - | FileCheck %s ! RUN: bbc -hlfir -emit-hlfir %openmp_flags %s -o - | FileCheck %s -!CHECK: func @_QQmain() attributes {fir.bindc_name = "sample"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "SAMPLE"} { !CHECK: %[[COUNT:.*]] = fir.address_of(@_QFEcount) : !fir.ref !CHECK: %[[COUNT_DECL:.*]]:2 = hlfir.declare %[[COUNT]] {uniq_name = "_QFEcount"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[ETA:.*]] = fir.alloca f32 {bindc_name = "eta", uniq_name = "_QFEeta"} diff --git a/flang/test/Lower/OpenMP/taskgroup02.f90 b/flang/test/Lower/OpenMP/taskgroup02.f90 index 1e996a030c23a..4c470b7aa82d1 100644 --- a/flang/test/Lower/OpenMP/taskgroup02.f90 +++ b/flang/test/Lower/OpenMP/taskgroup02.f90 @@ -3,8 +3,9 @@ ! Check that variables are not privatized twice when TASKGROUP is used. 
!CHECK-LABEL: func.func @_QPsub() { -!CHECK: omp.parallel { -!CHECK: %[[PAR_I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsubEi"} +!CHECK: omp.parallel private(@_QFsubEi_private_i32 %[[SUB_I:.*]]#0 -> %[[ARG:.*]] : !fir.ref) +!CHECK: %[[ALLOCA:.*]] = fir.alloca i32 +!CHECK: %[[PAR_I:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFsubEi"} !CHECK: omp.master { !CHECK: omp.taskgroup { !CHECK-NEXT: omp.task private(@_QFsubEi_firstprivate_i32 %[[PAR_I]]#0 -> %[[TASK_I:.*]] : !fir.ref) { diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 index 5e54cef8c29db..5c90ef7d84f89 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 @@ -3,7 +3,7 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { !CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 index 21547b47cf381..0e61261e8853e 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 @@ -3,7 +3,7 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { !CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 index 7a27efa2f84aa..1887e8aa68fdc 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 @@ -3,7 +3,7 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { !CHECK: %[[A:.*]] = fir.address_of(@_QFEa) : !fir.ref !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TP_A:.*]] = omp.threadprivate %[[A_DECL]]#0 : !fir.ref -> !fir.ref diff --git a/flang/test/Lower/OpenMP/unroll-heuristic01.f90 b/flang/test/Lower/OpenMP/unroll-heuristic01.f90 new file mode 100644 index 0000000000000..a5f5c003b8a7c --- /dev/null +++ b/flang/test/Lower/OpenMP/unroll-heuristic01.f90 @@ -0,0 +1,39 @@ +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s + + +subroutine omp_unroll_heuristic01(lb, ub, inc) + integer res, i, lb, ub, inc + + !$omp unroll + do i = lb, ub, inc + res = i + end do + !$omp end unroll + +end subroutine omp_unroll_heuristic01 + + +!CHECK-LABEL: func.func @_QPomp_unroll_heuristic01( +!CHECK: %c0_i32 = arith.constant 0 : i32 +!CHECK-NEXT: %c1_i32 = arith.constant 1 : i32 +!CHECK-NEXT: %13 = arith.cmpi slt, %12, %c0_i32 : i32 +!CHECK-NEXT: %14 = arith.subi %c0_i32, %12 : i32 +!CHECK-NEXT: %15 = arith.select %13, %14, %12 : i32 +!CHECK-NEXT: %16 = arith.select %13, %11, %10 : i32 +!CHECK-NEXT: %17 = arith.select %13, %10, %11 : i32 +!CHECK-NEXT: %18 = arith.subi %17, %16 overflow : i32 +!CHECK-NEXT: %19 = arith.divui %18, %15 : i32 +!CHECK-NEXT: %20 = arith.addi %19, %c1_i32 overflow : i32 +!CHECK-NEXT: %21 = arith.cmpi slt, %17, %16 : i32 +!CHECK-NEXT: %22 = arith.select %21, %c0_i32, %20 : i32 +!CHECK-NEXT: %canonloop_s0 = omp.new_cli +!CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv : i32 in range(%22) { +!CHECK-NEXT: %23 = arith.muli %iv, %12 : i32 +!CHECK-NEXT: %24 = arith.addi %10, %23 : i32 +!CHECK-NEXT: hlfir.assign %24 to %9#0 : i32, !fir.ref +!CHECK-NEXT: %25 = fir.load %9#0 : !fir.ref +!CHECK-NEXT: hlfir.assign %25 to %6#0 : i32, !fir.ref +!CHECK-NEXT: omp.terminator +!CHECK-NEXT: } +!CHECK-NEXT: omp.unroll_heuristic(%canonloop_s0) +!CHECK-NEXT: return diff --git a/flang/test/Lower/OpenMP/unroll-heuristic02.f90 b/flang/test/Lower/OpenMP/unroll-heuristic02.f90 new file mode 100644 index 0000000000000..14f694d6cdb78 --- /dev/null +++ b/flang/test/Lower/OpenMP/unroll-heuristic02.f90 @@ -0,0 +1,97 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s + + +subroutine omp_unroll_heuristic_nested02(outer_lb, outer_ub, outer_inc, inner_lb, inner_ub, inner_inc) + integer res, i, j, inner_lb, inner_ub, inner_inc, outer_lb, outer_ub, outer_inc + + !$omp unroll + do i = outer_lb, outer_ub, outer_inc + !$omp unroll + do j = inner_lb, inner_ub, inner_inc + res = i + j + end do + !$omp end unroll + end do + !$omp end unroll + +end subroutine omp_unroll_heuristic_nested02 + + +!CHECK-LABEL: func.func @_QPomp_unroll_heuristic_nested02( +!CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "outer_lb"}, +!CHECK-SAME: %[[ARG1:.*]]: !fir.ref {fir.bindc_name = "outer_ub"}, +!CHECK-SAME: %[[ARG2:.*]]: !fir.ref {fir.bindc_name = "outer_inc"}, +!CHECK-SAME: %[[ARG3:.*]]: !fir.ref {fir.bindc_name = "inner_lb"}, +!CHECK-SAME: %[[ARG4:.*]]: !fir.ref {fir.bindc_name = "inner_ub"}, +!CHECK-SAME: %[[ARG5:.*]]: !fir.ref {fir.bindc_name = "inner_inc"}) { +!CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope +!CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_unroll_heuristic_nested02Ei"} +!CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_inc"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_lb"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Einner_ub"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_6:.*]] = 
fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} +!CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_inc"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_lb"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_0]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eouter_ub"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_11:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_unroll_heuristic_nested02Eres"} +!CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFomp_unroll_heuristic_nested02Eres"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFomp_unroll_heuristic_nested02Ei"} +!CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_15:.*]] = fir.alloca i32 {bindc_name = "j", pinned, uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} +!CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref +!CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_10]]#0 : !fir.ref +!CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +!CHECK: %[[VAL_20:.*]] = arith.constant 0 : i32 +!CHECK: %[[VAL_21:.*]] = arith.constant 1 : i32 +!CHECK: %[[VAL_22:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_20]] : i32 +!CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_20]], %[[VAL_19]] : i32 +!CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_23]], %[[VAL_19]] : i32 +!CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_17]] : i32 +!CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_22]], %[[VAL_17]], %[[VAL_18]] : i32 +!CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_26]], %[[VAL_25]] overflow : i32 +!CHECK: %[[VAL_28:.*]] = arith.divui %[[VAL_27]], %[[VAL_24]] : i32 +!CHECK: %[[VAL_29:.*]] = arith.addi %[[VAL_28]], %[[VAL_21]] overflow : i32 +!CHECK: %[[VAL_30:.*]] = arith.cmpi slt, %[[VAL_26]], %[[VAL_25]] : i32 +!CHECK: %[[VAL_31:.*]] = arith.select %[[VAL_30]], %[[VAL_20]], %[[VAL_29]] : i32 +!CHECK: %[[VAL_32:.*]] = omp.new_cli +!CHECK: omp.canonical_loop(%[[VAL_32]]) %[[VAL_33:.*]] : i32 in range(%[[VAL_31]]) { +!CHECK: %[[VAL_34:.*]] = arith.muli %[[VAL_33]], %[[VAL_19]] : i32 +!CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_17]], %[[VAL_34]] : i32 +!CHECK: hlfir.assign %[[VAL_35]] to %[[VAL_14]]#0 : i32, !fir.ref +!CHECK: %[[VAL_36:.*]] = fir.alloca i32 {bindc_name = "j", pinned, uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} +!CHECK: %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_36]] {uniq_name = "_QFomp_unroll_heuristic_nested02Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref +!CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +!CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref +!CHECK: %[[VAL_41:.*]] = arith.constant 0 : i32 +!CHECK: %[[VAL_42:.*]] = arith.constant 1 : i32 +!CHECK: %[[VAL_43:.*]] = arith.cmpi slt, %[[VAL_40]], %[[VAL_41]] : i32 +!CHECK: %[[VAL_44:.*]] 
= arith.subi %[[VAL_41]], %[[VAL_40]] : i32 +!CHECK: %[[VAL_45:.*]] = arith.select %[[VAL_43]], %[[VAL_44]], %[[VAL_40]] : i32 +!CHECK: %[[VAL_46:.*]] = arith.select %[[VAL_43]], %[[VAL_39]], %[[VAL_38]] : i32 +!CHECK: %[[VAL_47:.*]] = arith.select %[[VAL_43]], %[[VAL_38]], %[[VAL_39]] : i32 +!CHECK: %[[VAL_48:.*]] = arith.subi %[[VAL_47]], %[[VAL_46]] overflow : i32 +!CHECK: %[[VAL_49:.*]] = arith.divui %[[VAL_48]], %[[VAL_45]] : i32 +!CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_49]], %[[VAL_42]] overflow : i32 +!CHECK: %[[VAL_51:.*]] = arith.cmpi slt, %[[VAL_47]], %[[VAL_46]] : i32 +!CHECK: %[[VAL_52:.*]] = arith.select %[[VAL_51]], %[[VAL_41]], %[[VAL_50]] : i32 +!CHECK: %[[VAL_53:.*]] = omp.new_cli +!CHECK: omp.canonical_loop(%[[VAL_53]]) %[[VAL_54:.*]] : i32 in range(%[[VAL_52]]) { +!CHECK: %[[VAL_55:.*]] = arith.muli %[[VAL_54]], %[[VAL_40]] : i32 +!CHECK: %[[VAL_56:.*]] = arith.addi %[[VAL_38]], %[[VAL_55]] : i32 +!CHECK: hlfir.assign %[[VAL_56]] to %[[VAL_37]]#0 : i32, !fir.ref +!CHECK: %[[VAL_57:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref +!CHECK: %[[VAL_58:.*]] = fir.load %[[VAL_37]]#0 : !fir.ref +!CHECK: %[[VAL_59:.*]] = arith.addi %[[VAL_57]], %[[VAL_58]] : i32 +!CHECK: hlfir.assign %[[VAL_59]] to %[[VAL_12]]#0 : i32, !fir.ref +!CHECK: omp.terminator +!CHECK: } +!CHECK: omp.unroll_heuristic(%[[VAL_53]]) +!CHECK: omp.terminator +!CHECK: } +!CHECK: omp.unroll_heuristic(%[[VAL_32]]) +!CHECK: return +!CHECK: } diff --git a/flang/test/Lower/OpenMP/wsloop-chunks.f90 b/flang/test/Lower/OpenMP/wsloop-chunks.f90 index 29c02a3b3c8d5..f3f11d8c4a6c2 100644 --- a/flang/test/Lower/OpenMP/wsloop-chunks.f90 +++ b/flang/test/Lower/OpenMP/wsloop-chunks.f90 @@ -7,7 +7,7 @@ program wsloop integer :: i integer :: chunk -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "wsloop"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "WSLOOP"} { ! CHECK: %[[CHUNK_REF:.*]] = fir.alloca i32 {bindc_name = "chunk", uniq_name = "_QFEchunk"} ! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[CHUNK_REF]] {uniq_name = "_QFEchunk"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-collapse.f90 b/flang/test/Lower/OpenMP/wsloop-collapse.f90 index a4d5cbdc03d3e..7ec40ab4b2f43 100644 --- a/flang/test/Lower/OpenMP/wsloop-collapse.f90 +++ b/flang/test/Lower/OpenMP/wsloop-collapse.f90 @@ -2,7 +2,7 @@ ! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s -!CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "wsloop_collapse"} { +!CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "WSLOOP_COLLAPSE"} { program wsloop_collapse !CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 index 58b68e5ec4cfd..e2f75bc8e4481 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 @@ -156,7 +156,7 @@ program reduce15 ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce15"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE15"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEarr) : !fir.ref>>> ! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = {{.*}}, uniq_name = "_QFEarr"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 index 0a536eb34e7af..663851cba46c6 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 @@ -63,7 +63,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box> {bindc_name = "r", uniq_name = "_QFEr"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 index 9f0dd16002baf..2233a74600948 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 @@ -31,5 +31,5 @@ program reduce ! CHECK: omp.yield(%[[ARG0]] : !fir.ref>>) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: omp.wsloop {{.*}} reduction(byref @add_reduction_byref_box_2xi32 %{{.*}} -> %{{.*}} : !fir.ref>>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 index 5ada623a0ed23..211bde19da8db 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 @@ -40,5 +40,5 @@ subroutine sub(a, lb, ub) ! CHECK: omp.yield(%[[ARG0]] : !fir.ref>>) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: omp.wsloop {{.*}} reduction(byref @add_reduction_byref_box_Uxi32 %{{.*}} -> %{{.*}} : !fir.ref>>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 index 21261da49710c..b7882bcbc0d13 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 @@ -65,7 +65,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 index ab8dcf1f076c0..7d90335a13a87 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 @@ -65,7 +65,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! 
CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 index 1e26f5a24d41e..d776bd7cfdd03 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 @@ -28,7 +28,7 @@ program reduce ! CHECK: omp.yield(%[[VAL_2]] : i32) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 index e0a3b469f40c1..5133db0347034 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 @@ -93,7 +93,7 @@ program main ! CHECK: omp.yield(%[[VAL_2]] : f64) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEarray) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 3 : index diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 index 40b4302f24cd4..27b726376fbeb 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 @@ -64,7 +64,7 @@ program reduce_pointer ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce_pointer"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE_POINTER"} { ! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box> {bindc_name = "v", uniq_name = "_QFEv"} diff --git a/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90 index 73669c25b339e..d7d14581b4b7f 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -fno-ppc-native-vector-element-order -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR" %s +! RUN: %flang_fc1 -emit-llvm %s -fno-ppc-native-vector-element-order -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR" %s ! REQUIRES: target=powerpc{{.*}} !CHECK-LABEL: vec_extract_testr4i8 @@ -27,6 +27,7 @@ subroutine vec_extract_testi8i1(arg1, arg2, r) ! LLVMIR: %[[arg2:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[arg2]], 2 ! LLVMIR: %[[sub:.*]] = sub i8 1, %[[urem]] -! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[arg1]], i8 %[[sub]] +! LLVMIR: %[[idx:.*]] = zext i8 %[[sub]] to i64 +! 
LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[arg1]], i64 %[[idx]] ! LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 end subroutine vec_extract_testi8i1 diff --git a/flang/test/Lower/PowerPC/ppc-vec-extract.f90 b/flang/test/Lower/PowerPC/ppc-vec-extract.f90 index 0f279347b6b75..32c0dcfd66013 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-extract.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-extract.f90 @@ -1,5 +1,5 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s +! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s +! RUN: %flang_fc1 -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s ! REQUIRES: target=powerpc{{.*}} !------------- @@ -19,8 +19,9 @@ subroutine vec_extract_testf32(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i8 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x float> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x float> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[idx]] ! LLVMIR: store float %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i2) @@ -29,8 +30,9 @@ subroutine vec_extract_testf32(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i16 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x float> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x float> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[idx]] ! LLVMIR: store float %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i4) @@ -39,18 +41,19 @@ subroutine vec_extract_testf32(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i32 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x float> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x float> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[idx]] ! LLVMIR: store float %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 4 -! LLVMIR-BE: %[[s:.*]] = sub i64 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 4 +! LLVMIR-BE: %[[idx:.*]] = sub i64 3, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 4 +! LLVMIR: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[idx]] ! 
LLVMIR: store float %[[r]], ptr %{{[0-9]}}, align 4 end subroutine vec_extract_testf32 @@ -68,8 +71,9 @@ subroutine vec_extract_testf64(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i8 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x double> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x double> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[idx]] ! LLVMIR: store double %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i2) @@ -78,8 +82,9 @@ subroutine vec_extract_testf64(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i16 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x double> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x double> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[idx]] ! LLVMIR: store double %[[r]], ptr %{{[0-9]}}, align 8 @@ -89,18 +94,19 @@ subroutine vec_extract_testf64(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i32 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x double> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x double> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[idx]] ! LLVMIR: store double %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 2 -! LLVMIR-BE: %[[s:.*]] = sub i64 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 2 +! LLVMIR-BE: %[[idx:.*]] = sub i64 1, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 2 +! LLVMIR: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[idx]] ! LLVMIR: store double %[[r]], ptr %{{[0-9]}}, align 8 end subroutine vec_extract_testf64 @@ -118,8 +124,9 @@ subroutine vec_extract_testi8(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i8 15, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[idx]] ! LLVMIR: store i8 %[[r]], ptr %{{[0-9]}}, align 1 r = vec_extract(x, i2) @@ -128,8 +135,9 @@ subroutine vec_extract_testi8(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i16 15, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! 
LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[idx]] ! LLVMIR: store i8 %[[r]], ptr %{{[0-9]}}, align 1 r = vec_extract(x, i4) @@ -138,18 +146,19 @@ subroutine vec_extract_testi8(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i32 15, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[idx]] ! LLVMIR: store i8 %[[r]], ptr %{{[0-9]}}, align 1 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 16 -! LLVMIR-BE: %[[s:.*]] = sub i64 15, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 16 +! LLVMIR-BE: %[[idx:.*]] = sub i64 15, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 16 +! LLVMIR: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[idx]] ! LLVMIR: store i8 %[[r]], ptr %{{[0-9]}}, align 1 end subroutine vec_extract_testi8 @@ -167,8 +176,9 @@ subroutine vec_extract_testi16(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i8 7, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[idx]] ! LLVMIR: store i16 %[[r]], ptr %{{[0-9]}}, align 2 r = vec_extract(x, i2) @@ -177,8 +187,9 @@ subroutine vec_extract_testi16(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i16 7, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[idx]] ! LLVMIR: store i16 %[[r]], ptr %{{[0-9]}}, align 2 r = vec_extract(x, i4) @@ -187,18 +198,19 @@ subroutine vec_extract_testi16(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i32 7, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[idx]] ! LLVMIR: store i16 %[[r]], ptr %{{[0-9]}}, align 2 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 8 -! LLVMIR-BE: %[[s:.*]] = sub i64 7, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[u]] -! 
LLVMIR-BE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 8 +! LLVMIR-BE: %[[idx:.*]] = sub i64 7, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 8 +! LLVMIR: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[idx]] ! LLVMIR: store i16 %[[r]], ptr %{{[0-9]}}, align 2 end subroutine vec_extract_testi16 @@ -216,8 +228,9 @@ subroutine vec_extract_testi32(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i8 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[idx]] ! LLVMIR: store i32 %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i2) @@ -226,8 +239,9 @@ subroutine vec_extract_testi32(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i16 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[idx]] ! LLVMIR: store i32 %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i4) @@ -236,18 +250,19 @@ subroutine vec_extract_testi32(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i32 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[idx]] ! LLVMIR: store i32 %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 4 -! LLVMIR-BE: %[[s:.*]] = sub i64 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 4 +! LLVMIR-BE: %[[idx:.*]] = sub i64 3, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 4 +! LLVMIR: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[idx]] ! LLVMIR: store i32 %[[r]], ptr %{{[0-9]}}, align 4 end subroutine vec_extract_testi32 @@ -265,8 +280,9 @@ subroutine vec_extract_testi64(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i8 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[idx]] ! LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i2) @@ -275,8 +291,9 @@ subroutine vec_extract_testi64(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! 
LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i16 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[idx]] ! LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i4) @@ -285,17 +302,18 @@ subroutine vec_extract_testi64(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i32 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[idx]] ! LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 2 -! LLVMIR-BE: %[[s:.*]] = sub i64 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 2 +! LLVMIR-BE: %[[idx:.*]] = sub i64 1, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 2 +! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[idx]] ! LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 end subroutine vec_extract_testi64 diff --git a/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90 index f64df46f170ab..b30065d74e46b 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -fno-ppc-native-vector-element-order -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR" %s +! RUN: %flang_fc1 -emit-llvm %s -fno-ppc-native-vector-element-order -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR" %s ! REQUIRES: target=powerpc{{.*}} !CHECK-LABEL: vec_insert_testf32i64 @@ -31,6 +31,7 @@ subroutine vec_insert_testi64i8(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 2 ! LLVMIR: %[[sub:.*]] = sub i8 1, %[[urem]] -! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i8 %[[sub]] +! LLVMIR: %[[idx:.*]] = zext i8 %[[sub]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi64i8 diff --git a/flang/test/Lower/PowerPC/ppc-vec-insert.f90 b/flang/test/Lower/PowerPC/ppc-vec-insert.f90 index dd57fcc67be08..26bc7fc114cec 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-insert.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-insert.f90 @@ -1,5 +1,5 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s +! 
RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s +! RUN: %flang_fc1 -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s ! REQUIRES: target=powerpc{{.*}} ! vec_insert @@ -20,8 +20,9 @@ subroutine vec_insert_testf32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i8 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -31,8 +32,9 @@ subroutine vec_insert_testf32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i16 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -42,8 +44,9 @@ subroutine vec_insert_testf32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i32 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -51,10 +54,10 @@ subroutine vec_insert_testf32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load float, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 4 -! LLVMIR-BE: %[[s:.*]] = sub i64 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 4 +! LLVMIR-BE: %[[idx:.*]] = sub i64 3, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 4 +! LLVMIR: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testf32 @@ -74,8 +77,9 @@ subroutine vec_insert_testf64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i8 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i8 %[[s]] +! 
LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -85,8 +89,9 @@ subroutine vec_insert_testf64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i16 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -96,8 +101,9 @@ subroutine vec_insert_testf64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i32 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -105,10 +111,10 @@ subroutine vec_insert_testf64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load double, ptr %{{[0-9]}}, align 8 ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 2 -! LLVMIR-BE: %[[s:.*]] = sub i64 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 2 +! LLVMIR-BE: %[[idx:.*]] = sub i64 1, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 2 +! LLVMIR: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testf64 @@ -128,8 +134,9 @@ subroutine vec_insert_testi8(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i8 15, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[idx]] ! LLVMIR: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -139,8 +146,9 @@ subroutine vec_insert_testi8(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i16 15, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i16 %[[s]] +! 
LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[idx]] ! LLVMIR: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -150,8 +158,9 @@ subroutine vec_insert_testi8(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i32 15, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[idx]] ! LLVMIR: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -159,10 +168,10 @@ subroutine vec_insert_testi8(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 16 -! LLVMIR-BE: %[[s:.*]] = sub i64 15, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 16 +! LLVMIR-BE: %[[idx:.*]] = sub i64 15, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 16 +! LLVMIR: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[idx]] ! LLVMIR: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi8 @@ -182,8 +191,9 @@ subroutine vec_insert_testi16(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i8 7, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[idx]] ! LLVMIR: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -193,8 +203,9 @@ subroutine vec_insert_testi16(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i16 7, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[idx]] ! LLVMIR: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -204,8 +215,9 @@ subroutine vec_insert_testi16(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i32 7, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! 
LLVMIR: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[idx]] ! LLVMIR: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -213,10 +225,10 @@ subroutine vec_insert_testi16(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 8 -! LLVMIR-BE: %[[s:.*]] = sub i64 7, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 8 +! LLVMIR-BE: %[[idx:.*]] = sub i64 7, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 8 +! LLVMIR: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[idx]] ! LLVMIR: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi16 @@ -236,8 +248,9 @@ subroutine vec_insert_testi32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i8 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -247,8 +260,9 @@ subroutine vec_insert_testi32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i16 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -258,8 +272,9 @@ subroutine vec_insert_testi32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i32 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -267,10 +282,10 @@ subroutine vec_insert_testi32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 4 -! LLVMIR-BE: %[[s:.*]] = sub i64 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 4 +! LLVMIR-BE: %[[idx:.*]] = sub i64 3, %[[urem]] +! 
LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 4 +! LLVMIR: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi32 @@ -290,8 +305,9 @@ subroutine vec_insert_testi64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i8 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -301,8 +317,9 @@ subroutine vec_insert_testi64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i16 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -312,8 +329,9 @@ subroutine vec_insert_testi64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i32 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -321,9 +339,9 @@ subroutine vec_insert_testi64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load i64, ptr %{{[0-9]}}, align 8 ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 2 -! LLVMIR-BE: %[[s:.*]] = sub i64 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 2 +! LLVMIR-BE: %[[idx:.*]] = sub i64 1, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 2 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi64 diff --git a/flang/test/Lower/PowerPC/ppc-vec-splat-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-splat-elem-order.f90 index 50604e1f720f3..ca8c0c3f6f1d8 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-splat-elem-order.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-splat-elem-order.f90 @@ -19,7 +19,7 @@ subroutine vec_splat_testu8i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 15 +! LLVMIR: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! 
LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 diff --git a/flang/test/Lower/PowerPC/ppc-vec-splat.f90 b/flang/test/Lower/PowerPC/ppc-vec-splat.f90 index f3c1f19d5877d..55614c75d1ad8 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-splat.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-splat.f90 @@ -1,5 +1,5 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s +! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s +! RUN: %flang_fc1 -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s ! REQUIRES: target=powerpc{{.*}} !---------------- @@ -12,8 +12,8 @@ subroutine vec_splat_testi8i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i8 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -25,8 +25,8 @@ subroutine vec_splat_testi8i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -38,8 +38,8 @@ subroutine vec_splat_testi8i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i32 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -64,8 +64,8 @@ subroutine vec_splat_testi16i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i8 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! 
LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -77,8 +77,8 @@ subroutine vec_splat_testi16i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i16 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -90,8 +90,8 @@ subroutine vec_splat_testi16i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i32 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -116,8 +116,8 @@ subroutine vec_splat_testi32i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i8 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -129,8 +129,8 @@ subroutine vec_splat_testi32i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i16 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -142,8 +142,8 @@ subroutine vec_splat_testi32i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i32 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -168,8 +168,8 @@ subroutine vec_splat_testi64i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! 
LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i8 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -181,8 +181,8 @@ subroutine vec_splat_testi64i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i16 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -194,8 +194,8 @@ subroutine vec_splat_testi64i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i32 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -220,8 +220,8 @@ subroutine vec_splat_testf32i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i8 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x float> poison, float %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x float> %[[ins]], <4 x float> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x float> %[[y]], ptr %{{[0-9]}}, align 16 @@ -233,8 +233,8 @@ subroutine vec_splat_testf32i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i16 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x float> poison, float %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x float> %[[ins]], <4 x float> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x float> %[[y]], ptr %{{[0-9]}}, align 16 @@ -246,8 +246,8 @@ subroutine vec_splat_testf32i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i32 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 3 ! 
LLVMIR: %[[ins:.*]] = insertelement <4 x float> poison, float %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x float> %[[ins]], <4 x float> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x float> %[[y]], ptr %{{[0-9]}}, align 16 @@ -272,8 +272,8 @@ subroutine vec_splat_testf64i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i8 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x double> poison, double %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x double> %[[ins]], <2 x double> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x double> %[[y]], ptr %{{[0-9]}}, align 16 @@ -285,8 +285,8 @@ subroutine vec_splat_testf64i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i16 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x double> poison, double %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x double> %[[ins]], <2 x double> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x double> %[[y]], ptr %{{[0-9]}}, align 16 @@ -298,8 +298,8 @@ subroutine vec_splat_testf64i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i32 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x double> poison, double %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x double> %[[ins]], <2 x double> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x double> %[[y]], ptr %{{[0-9]}}, align 16 @@ -324,8 +324,8 @@ subroutine vec_splat_testu8i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i8 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -337,8 +337,8 @@ subroutine vec_splat_testu8i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! 
LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -350,8 +350,8 @@ subroutine vec_splat_testu8i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i32 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -376,8 +376,8 @@ subroutine vec_splat_testu16i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i8 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -389,8 +389,8 @@ subroutine vec_splat_testu16i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i16 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -402,8 +402,8 @@ subroutine vec_splat_testu16i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i32 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -428,8 +428,8 @@ subroutine vec_splat_testu32i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i8 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -441,8 +441,8 @@ subroutine vec_splat_testu32i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i16 3 +! 
LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -454,8 +454,8 @@ subroutine vec_splat_testu32i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i32 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -480,8 +480,8 @@ subroutine vec_splat_testu64i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i8 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -493,8 +493,8 @@ subroutine vec_splat_testu64i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i16 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -506,8 +506,8 @@ subroutine vec_splat_testu64i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i32 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 diff --git a/flang/test/Lower/amdgcn-complex.f90 b/flang/test/Lower/amdgcn-complex.f90 new file mode 100644 index 0000000000000..f15c7db2b7316 --- /dev/null +++ b/flang/test/Lower/amdgcn-complex.f90 @@ -0,0 +1,21 @@ +! REQUIRES: amdgpu-registered-target +! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s + +subroutine cabsf_test(a, b) + complex :: a + real :: b + b = abs(a) +end subroutine + +! CHECK-LABEL: func @_QPcabsf_test( +! CHECK: complex.abs +! CHECK-NOT: fir.call @cabsf + +subroutine cexpf_test(a, b) + complex :: a, b + b = exp(a) +end subroutine + +! 
CHECK-LABEL: func @_QPcexpf_test( +! CHECK: complex.exp +! CHECK-NOT: fir.call @cexpf diff --git a/flang/test/Lower/array-character.f90 b/flang/test/Lower/array-character.f90 index 1bc73dae44235..e2899d967c80d 100644 --- a/flang/test/Lower/array-character.f90 +++ b/flang/test/Lower/array-character.f90 @@ -32,7 +32,7 @@ program p call charlit end program p -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.array<3x!fir.char<1,4>> {bindc_name = "c1", uniq_name = "_QFEc1"} diff --git a/flang/test/Lower/array-expression-slice-1.f90 b/flang/test/Lower/array-expression-slice-1.f90 index b597814bc0d9f..73943137cb18d 100644 --- a/flang/test/Lower/array-expression-slice-1.f90 +++ b/flang/test/Lower/array-expression-slice-1.f90 @@ -1,6 +1,6 @@ ! RUN: bbc -hlfir=false -fwrapv -o - --outline-intrinsics %s | FileCheck %s -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK-DAG: %[[VAL_0:.*]] = arith.constant 10 : index ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2 : index ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index diff --git a/flang/test/Lower/basic-program.f90 b/flang/test/Lower/basic-program.f90 index 5a0e4bdc7b4a1..7e5b40d9e2f0a 100644 --- a/flang/test/Lower/basic-program.f90 +++ b/flang/test/Lower/basic-program.f90 @@ -4,10 +4,10 @@ program basic end program -! CHECK: 1 Program basic +! CHECK: 1 Program BASIC ! CHECK: 1 EndProgramStmt: end program -! CHECK: End Program basic +! CHECK: End Program BASIC -! FIR-LABEL: func @_QQmain() attributes {fir.bindc_name = "basic"} { +! FIR-LABEL: func @_QQmain() attributes {fir.bindc_name = "BASIC"} { ! FIR: return ! FIR: } diff --git a/flang/test/Lower/big-integer-parameter.f90 b/flang/test/Lower/big-integer-parameter.f90 index a413b1224ebc2..ca90b8adfb318 100644 --- a/flang/test/Lower/big-integer-parameter.f90 +++ b/flang/test/Lower/big-integer-parameter.f90 @@ -13,7 +13,7 @@ program i128 print*,y end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "i128"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "I128"} { ! CHECK-COUNT-2: %{{.*}} = fir.call @_FortranAioOutputInteger128(%{{.*}}, %{{.*}}) {{.*}}: (!fir.ref, i128) -> i1 diff --git a/flang/test/Lower/derived-type-finalization.f90 b/flang/test/Lower/derived-type-finalization.f90 index 3ea58cd719f4a..71cef34899603 100644 --- a/flang/test/Lower/derived-type-finalization.f90 +++ b/flang/test/Lower/derived-type-finalization.f90 @@ -255,5 +255,5 @@ program p type(t1) :: t end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK-NOT: fir.call @_FortranADestroy diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90 index 1e4bb73350871..d1e12a8dbdfec 100644 --- a/flang/test/Lower/forall/character-1.f90 +++ b/flang/test/Lower/forall/character-1.f90 @@ -29,7 +29,9 @@ end program test ! CHECK: %[[esval:.*]] = load i64, ptr %[[elesize]] ! CHECK: %[[mul:.*]] = mul i64 1, %[[esval]] ! CHECK: %[[mul2:.*]] = mul i64 %[[mul]], %[[extval]] -! CHECK: %[[buff:.*]] = call ptr @malloc(i64 %[[mul2]]) +! CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +! 
CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +! CHECK: %[[buff:.*]] = call ptr @malloc(i64 %[[size]]) ! CHECK: %[[to:.*]] = getelementptr i8, ptr %[[buff]], i64 % ! CHECK: call void @llvm.memmove.p0.p0.i64(ptr %[[to]], ptr %{{.*}}, i64 %{{.*}}, i1 false) ! CHECK: call void @free(ptr %[[buff]]) diff --git a/flang/test/Lower/io-derived-type.f90 b/flang/test/Lower/io-derived-type.f90 index 7d2fef3faa2b7..7c289ce261678 100644 --- a/flang/test/Lower/io-derived-type.f90 +++ b/flang/test/Lower/io-derived-type.f90 @@ -37,16 +37,16 @@ subroutine test1 import, all ! CHECK: %[[V_16:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_17:[0-9]+]] = fir.convert %[[V_16]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_18:[0-9]+]] = fir.address_of(@_QQMmFtest1.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_18]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_18:[0-9]+]] = fir.address_of(@_QQMmFtest1.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_18]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_20:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_17]], %[[V_19]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'test1 outer, should call wft: ', t(1) block import, only: t ! CHECK: %[[V_37:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_38:[0-9]+]] = fir.convert %[[V_37]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_39:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_40:[0-9]+]] = fir.convert %[[V_39]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_39:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_40:[0-9]+]] = fir.convert %[[V_39]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_41:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_38]], %[[V_40]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'test1 block, should not call wft: ', t(2) end block @@ -56,8 +56,8 @@ subroutine test1 subroutine test2 ! CHECK: %[[V_15:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_16:[0-9]+]] = fir.convert %[[V_15]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_17:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_18:[0-9]+]] = fir.convert %[[V_17]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_17:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_18:[0-9]+]] = fir.convert %[[V_17]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_19:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_16]], %[[V_18]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 import, only: t @@ -74,23 +74,23 @@ subroutine test3(p, x) ! CHECK: %[[V_3:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_4:[0-9]+]] = fir.convert %[[V_3]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_5:[0-9]+]] = fir.alloca !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: %[[V_6:[0-9]+]] = fir.undefined !fir.array<1xtuple, !fir.ref, i32, i1>> + ! CHECK: %[[V_5:[0-9]+]] = fir.alloca !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: %[[V_6:[0-9]+]] = fir.undefined !fir.array<1xtuple, !fir.ref, i32, i8>> ! 
CHECK: %[[V_7:[0-9]+]] = fir.address_of(@_QMmE.dt.t) ! CHECK: %[[V_8:[0-9]+]] = fir.convert %[[V_7]] : {{.*}} -> !fir.ref - ! CHECK: %[[V_9:[0-9]+]] = fir.insert_value %[[V_6]], %[[V_8]], [0 : index, 0 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i1>> + ! CHECK: %[[V_9:[0-9]+]] = fir.insert_value %[[V_6]], %[[V_8]], [0 : index, 0 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i8>> ! CHECK: %[[V_10:[0-9]+]] = fir.box_addr %arg0 : (!fir.boxproc<() -> ()>) -> !fir.ref - ! CHECK: %[[V_11:[0-9]+]] = fir.insert_value %[[V_9]], %[[V_10]], [0 : index, 1 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: %[[V_12:[0-9]+]] = fir.insert_value %[[V_11]], %c2{{.*}}, [0 : index, 2 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, i32) -> !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: %[[V_13:[0-9]+]] = fir.insert_value %[[V_12]], %true, [0 : index, 3 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, i1) -> !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: fir.store %[[V_13]] to %[[V_5]] : !fir.ref, !fir.ref, i32, i1>>> - ! CHECK: %[[V_14:[0-9]+]] = fir.alloca tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_15:[0-9]+]] = fir.undefined tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_16:[0-9]+]] = fir.insert_value %[[V_15]], %c1{{.*}}, [0 : index] : (tuple, !fir.ref, i32, i1>>>, i1>, i64) -> tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_17:[0-9]+]] = fir.insert_value %[[V_16]], %[[V_5]], [1 : index] : (tuple, !fir.ref, i32, i1>>>, i1>, !fir.ref, !fir.ref, i32, i1>>>) -> tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_18:[0-9]+]] = fir.insert_value %[[V_17]], %true_0, [2 : index] : (tuple, !fir.ref, i32, i1>>>, i1>, i1) -> tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: fir.store %[[V_18]] to %[[V_14]] : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_14]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_11:[0-9]+]] = fir.insert_value %[[V_9]], %[[V_10]], [0 : index, 1 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: %[[V_12:[0-9]+]] = fir.insert_value %[[V_11]], %c2{{.*}}, [0 : index, 2 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, i32) -> !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: %[[V_13:[0-9]+]] = fir.insert_value %[[V_12]], %c1_i8, [0 : index, 3 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, i8) -> !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: fir.store %[[V_13]] to %[[V_5]] : !fir.ref, !fir.ref, i32, i8>>> + ! CHECK: %[[V_14:[0-9]+]] = fir.alloca tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_15:[0-9]+]] = fir.undefined tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_16:[0-9]+]] = fir.insert_value %[[V_15]], %c1{{.*}}, [0 : index] : (tuple, !fir.ref, i32, i8>>>, i1>, i64) -> tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_17:[0-9]+]] = fir.insert_value %[[V_16]], %[[V_5]], [1 : index] : (tuple, !fir.ref, i32, i8>>>, i1>, !fir.ref, !fir.ref, i32, i8>>>) -> tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_18:[0-9]+]] = fir.insert_value %[[V_17]], %true, [2 : index] : (tuple, !fir.ref, i32, i8>>>, i1>, i1) -> tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: fir.store %[[V_18]] to %[[V_14]] : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_14]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! 
CHECK: %[[V_20:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_4]], %[[V_19]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, x end subroutine @@ -112,8 +112,8 @@ program p ! CHECK: %[[V_97:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_98:[0-9]+]] = fir.convert %[[V_97]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_99:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_100:[0-9]+]] = fir.convert %[[V_99]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_99:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_100:[0-9]+]] = fir.convert %[[V_99]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_101:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_98]], %[[V_100]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'main, should call wft: ', t(4) @@ -122,14 +122,14 @@ program p ! CHECK: %[[V_35:[0-9]+]] = fir.shape %c2{{.*}} : (index) -> !fir.shape<1> ! CHECK: %[[V_36:[0-9]+]] = fir.embox %[[V_34]](%[[V_35]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> ! CHECK: %[[V_37:[0-9]+]] = fir.convert %[[V_36]] : (!fir.box>>) -> !fir.box - ! CHECK: %[[V_38:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_39:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_38:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_39:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_40:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_37]], %[[V_39]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, y(2:3) end -! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i1>> -! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i1>>>, i1> -! CHECK: fir.global linkonce @_QQdefault.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i1>>>, i1> -! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i1>> -! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i1>>>, i1> +! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i8>> +! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i8>>>, i1> +! CHECK: fir.global linkonce @_QQdefault.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i8>>>, i1> +! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i8>> +! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i8>>>, i1> diff --git a/flang/test/Lower/location.f90 b/flang/test/Lower/location.f90 index a6ece31bbebed..95bf2260fc107 100644 --- a/flang/test/Lower/location.f90 +++ b/flang/test/Lower/location.f90 @@ -5,7 +5,7 @@ program test end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} { ! CHECK: fir.call @_FortranAioOutputAscii(%{{.*}}, %{{.*}}, %{{.*}}) fastmath : (!fir.ref, !fir.ref, i64) -> i1 loc(fused<#fir>["{{.*}}location1.inc":1:10, "{{.*}}location0.inc":1:1, "{{.*}}location.f90":4:1]) ! 
CHECK: return loc("{{.*}}location.f90":6:1) ! CHECK: } loc("{{.*}}location.f90":3:1) diff --git a/flang/test/Lower/namelist.f90 b/flang/test/Lower/namelist.f90 index 94b0ef11cb102..770af46eea744 100644 --- a/flang/test/Lower/namelist.f90 +++ b/flang/test/Lower/namelist.f90 @@ -42,8 +42,8 @@ program p ! CHECK: %[[V_42:[0-9]+]] = fir.insert_value %[[V_39]], %[[V_41]], [0 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_43:[0-9]+]] = fir.insert_value %[[V_42]], %c2{{.*}}, [1 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, i64) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_44:[0-9]+]] = fir.insert_value %[[V_43]], %[[V_24]], [2 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref, !fir.ref>>>>) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> - ! CHECK: %[[V_45:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_46:[0-9]+]] = fir.convert %[[V_45]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_45:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_46:[0-9]+]] = fir.convert %[[V_45]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_47:[0-9]+]] = fir.insert_value %[[V_44]], %[[V_46]], [3 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: fir.store %[[V_47]] to %[[V_38]] : !fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>> ! CHECK: %[[V_48:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>>) -> !fir.ref> @@ -100,8 +100,8 @@ subroutine sss ! CHECK: %[[V_20:[0-9]+]] = fir.insert_value %[[V_17]], %[[V_19]], [0 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_21:[0-9]+]] = fir.insert_value %[[V_20]], %c1{{.*}}, [1 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, i64) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_22:[0-9]+]] = fir.insert_value %[[V_21]], %[[V_8]], [2 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref, !fir.ref>>>>) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> - ! CHECK: %[[V_23:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_24:[0-9]+]] = fir.convert %[[V_23]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_23:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_24:[0-9]+]] = fir.convert %[[V_23]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_25:[0-9]+]] = fir.insert_value %[[V_22]], %[[V_24]], [3 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: fir.store %[[V_25]] to %[[V_16]] : !fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>> ! CHECK: %[[V_26:[0-9]+]] = fir.convert %[[V_16]] : (!fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>>) -> !fir.ref> diff --git a/flang/test/Lower/nested-where.f90 b/flang/test/Lower/nested-where.f90 index ab457280b80ce..28aced2325813 100644 --- a/flang/test/Lower/nested-where.f90 +++ b/flang/test/Lower/nested-where.f90 @@ -1,6 +1,6 @@ ! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "nested_where"} { +! 
CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "NESTED_WHERE"} { program nested_where ! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index b7be5f685d9e3..f586380e653a0 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -178,6 +178,17 @@ subroutine polymorphic_to_nonpolymorphic(p) ! CHECK-LABEL: func.func @_QMpolymorphic_testPpolymorphic_to_nonpolymorphic ! Just checking that FIR is generated without error. + subroutine nonpolymorphic_to_polymorphic(p, t) + type p1 + end type + type(p1), pointer :: p(:) + class(p1), target :: t(:) + p(0:1) => t + end subroutine + +! CHECK-LABEL: func.func @_QMpolymorphic_testPnonpolymorphic_to_polymorphic +! CHECK: fir.call @_FortranAPointerAssociateRemappingMonomorphic + ! Test that lowering does not crash for function return with unlimited ! polymoprhic value. @@ -1146,7 +1157,7 @@ program test l = i < o%inner end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} { ! CHECK: %[[ADDR_O:.*]] = fir.address_of(@_QFEo) : !fir.ref}>>>> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ADDR_O]] : (!fir.ref}>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/pre-fir-tree02.f90 b/flang/test/Lower/pre-fir-tree02.f90 index f4fa626ba6548..65c33e9b364fe 100644 --- a/flang/test/Lower/pre-fir-tree02.f90 +++ b/flang/test/Lower/pre-fir-tree02.f90 @@ -3,7 +3,7 @@ ! Test Pre-FIR Tree captures all the intended nodes from the parse-tree ! Coarray and OpenMP related nodes are tested in other files. -! CHECK: Program test_prog +! CHECK: Program TEST_PROG program test_prog ! Check specification part is not part of the tree. interface diff --git a/flang/test/Lower/pre-fir-tree03.f90 b/flang/test/Lower/pre-fir-tree03.f90 index 313dab4d6ec7c..1de66e3f8d016 100644 --- a/flang/test/Lower/pre-fir-tree03.f90 +++ b/flang/test/Lower/pre-fir-tree03.f90 @@ -2,7 +2,7 @@ ! Test Pre-FIR Tree captures OpenMP related constructs -! CHECK: Program test_omp +! CHECK: Program TEST_OMP program test_omp ! CHECK: PrintStmt print *, "sequential" diff --git a/flang/test/Lower/pre-fir-tree06.f90 b/flang/test/Lower/pre-fir-tree06.f90 index f84bcd8b58b2d..ed1e76cb375bd 100644 --- a/flang/test/Lower/pre-fir-tree06.f90 +++ b/flang/test/Lower/pre-fir-tree06.f90 @@ -25,13 +25,13 @@ subroutine sub2() end ! CHECK: End Module m2 -! CHECK: Program main +! CHECK: Program MAIN program main real :: y ! CHECK-NEXT: OpenMPDeclarativeConstruct !$omp threadprivate(y) end -! CHECK: End Program main +! CHECK: End Program MAIN ! CHECK: Subroutine sub1 subroutine sub1() diff --git a/flang/test/Lower/program-units-fir-mangling.f90 b/flang/test/Lower/program-units-fir-mangling.f90 index e0af6f065f34d..65940b4e1ff17 100644 --- a/flang/test/Lower/program-units-fir-mangling.f90 +++ b/flang/test/Lower/program-units-fir-mangling.f90 @@ -124,7 +124,7 @@ subroutine should_not_collide() ! CHECK: } end subroutine -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "TEST"} { program test ! 
CHECK: } contains diff --git a/flang/test/Lower/return-statement.f90 b/flang/test/Lower/return-statement.f90 index 6351a6859eb4f..8ab69e3146e2f 100644 --- a/flang/test/Lower/return-statement.f90 +++ b/flang/test/Lower/return-statement.f90 @@ -4,7 +4,7 @@ program basic return end program -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "basic"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "BASIC"} { ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90 index 2e05b652822b5..d1a844eddd106 100644 --- a/flang/test/Lower/volatile-openmp.f90 +++ b/flang/test/Lower/volatile-openmp.f90 @@ -23,11 +23,11 @@ ! CHECK: %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref>>}>> ! CHECK: %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref>>}>>) -> !fir.ref>>}>, volatile> ! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEcontainer"} : (!fir.ref>>}>, volatile>) -> (!fir.ref>>}>, volatile>, !fir.ref>>}>, volatile>) -! CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> +! CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> ! CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1> -! 
CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) -! 
CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>> -! CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, 
!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) +! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, 
!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) +! CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>> +! 
CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) ! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>, volatile>) -> !fir.ref>>, volatile> ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref>>, volatile> ! CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box>>, index) -> (index, index, index) diff --git a/flang/test/Lower/volatile-openmp1.f90 b/flang/test/Lower/volatile-openmp1.f90 index 163db953b6b80..07d81a1aeb240 100644 --- a/flang/test/Lower/volatile-openmp1.f90 +++ b/flang/test/Lower/volatile-openmp1.f90 @@ -13,7 +13,7 @@ program main !$omp end parallel end program -! 
CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_1:.*]] = arith.constant 1000 : i32 ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/volatile-string.f90 b/flang/test/Lower/volatile-string.f90 index 88b21d7b245e9..f263db7abb5fc 100644 --- a/flang/test/Lower/volatile-string.f90 +++ b/flang/test/Lower/volatile-string.f90 @@ -21,7 +21,7 @@ subroutine assign_different_length(string) end subroutine end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 11 : i32 ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = arith.constant true diff --git a/flang/test/Lower/volatile3.f90 b/flang/test/Lower/volatile3.f90 index 8825f8f3afbcb..a32f29d2bb9e7 100644 --- a/flang/test/Lower/volatile3.f90 +++ b/flang/test/Lower/volatile3.f90 @@ -70,7 +70,7 @@ subroutine sub_select_rank(arr) end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index diff --git a/flang/test/Parser/OpenMP/map-modifiers-v60.f90 b/flang/test/Parser/OpenMP/map-modifiers-v60.f90 new file mode 100644 index 0000000000000..bc80886780d46 --- /dev/null +++ b/flang/test/Parser/OpenMP/map-modifiers-v60.f90 @@ -0,0 +1,113 @@ +!RUN: %flang_fc1 -fdebug-unparse-no-sema -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree-no-sema -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00(x) + integer :: x + !$omp target map(always, close, delete, present, ompx_hold: x) + x = x + 1 + !$omp end target +end + +!UNPARSE: SUBROUTINE f00 (x) +!UNPARSE: INTEGER x +!UNPARSE: !$OMP TARGET MAP(ALWAYS, CLOSE, DELETE, PRESENT, OMPX_HOLD: x) +!UNPARSE: x = x+1 +!UNPARSE: !$OMP END TARGET +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginBlockDirective +!PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target +!PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause +!PARSE-TREE: | | Modifier -> OmpAlwaysModifier -> Value = Always +!PARSE-TREE: | | Modifier -> OmpCloseModifier -> Value = Close +!PARSE-TREE: | | Modifier -> OmpDeleteModifier -> Value = Delete +!PARSE-TREE: | | Modifier -> OmpPresentModifier -> Value = Present +!PARSE-TREE: | | Modifier -> OmpxHoldModifier -> Value = Ompx_Hold +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | bool = 'true' + +subroutine f01(x) + integer :: x + !$omp target map(self, storage: x) + x = x + 1 + !$omp end target +end + +!UNPARSE: !$OMP TARGET MAP(SELF, STORAGE: x) +!UNPARSE: x = x+1 +!UNPARSE: !$OMP END TARGET +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginBlockDirective +!PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target +!PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause +!PARSE-TREE: | | Modifier -> OmpSelfModifier -> Value = Self +!PARSE-TREE: | | Modifier -> OmpMapType -> Value = Storage +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | bool = 'true' + 
+subroutine f02(x) + integer, pointer :: x + !$omp target map(ref_ptr, to: x) + x = x + 1 + !$omp end target +end + +!UNPARSE: SUBROUTINE f02 (x) +!UNPARSE: INTEGER, POINTER :: x +!UNPARSE: !$OMP TARGET MAP(REF_PTR, TO: x) +!UNPARSE: x = x+1 +!UNPARSE: !$OMP END TARGET +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginBlockDirective +!PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target +!PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause +!PARSE-TREE: | | Modifier -> OmpRefModifier -> Value = Ref_Ptr +!PARSE-TREE: | | Modifier -> OmpMapType -> Value = To +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | bool = 'true' + +subroutine f03(x) + integer, pointer :: x + !$omp target map(ref_ptee, to: x) + x = x + 1 + !$omp end target +end + +!UNPARSE: SUBROUTINE f03 (x) +!UNPARSE: INTEGER, POINTER :: x +!UNPARSE: !$OMP TARGET MAP(REF_PTEE, TO: x) +!UNPARSE: x = x+1 +!UNPARSE: !$OMP END TARGET +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginBlockDirective +!PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target +!PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause +!PARSE-TREE: | | Modifier -> OmpRefModifier -> Value = Ref_Ptee +!PARSE-TREE: | | Modifier -> OmpMapType -> Value = To +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | bool = 'true' + +subroutine f04(x) + integer, pointer :: x + !$omp target map(ref_ptr_ptee, to: x) + x = x + 1 + !$omp end target +end + +!UNPARSE: SUBROUTINE f04 (x) +!UNPARSE: INTEGER, POINTER :: x +!UNPARSE: !$OMP TARGET MAP(REF_PTR_PTEE, TO: x) +!UNPARSE: x = x+1 +!UNPARSE: !$OMP END TARGET +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: OmpBeginBlockDirective +!PARSE-TREE: | OmpBlockDirective -> llvm::omp::Directive = target +!PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause +!PARSE-TREE: | | Modifier -> OmpRefModifier -> Value = Ref_Ptr_Ptee +!PARSE-TREE: | | Modifier -> OmpMapType -> Value = To +!PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | bool = 'true' diff --git a/flang/test/Parser/OpenMP/unroll-heuristic.f90 b/flang/test/Parser/OpenMP/unroll-heuristic.f90 new file mode 100644 index 0000000000000..2f589af0c83ca --- /dev/null +++ b/flang/test/Parser/OpenMP/unroll-heuristic.f90 @@ -0,0 +1,43 @@ +! RUN: %flang_fc1 -fopenmp -fopenmp-version=51 %s -fdebug-unparse | FileCheck --check-prefix=UNPARSE %s +! 
RUN: %flang_fc1 -fopenmp -fopenmp-version=51 %s -fdebug-dump-parse-tree | FileCheck --check-prefix=PTREE %s + +subroutine openmp_parse_unroll_heuristic + integer i + + !$omp unroll + do i = 1, 100 + call func(i) + end do + !$omp end unroll +END subroutine openmp_parse_unroll_heuristic + + +!UNPARSE: !$OMP UNROLL +!UNPARSE-NEXT: DO i=1_4,100_4 +!UNPARSE-NEXT: CALL func(i) +!UNPARSE-NEXT: END DO +!UNPARSE-NEXT: !$OMP END UNROLL + +!PTREE: OpenMPConstruct -> OpenMPLoopConstruct +!PTREE-NEXT: | OmpBeginLoopDirective +!PTREE-NEXT: | | OmpLoopDirective -> llvm::omp::Directive = unroll +!PTREE-NEXT: | | OmpClauseList -> +!PTREE-NEXT: | DoConstruct +!PTREE-NEXT: | | NonLabelDoStmt +!PTREE-NEXT: | | | LoopControl -> LoopBounds +!PTREE-NEXT: | | | | Scalar -> Name = 'i' +!PTREE-NEXT: | | | | Scalar -> Expr = '1_4' +!PTREE-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '1' +!PTREE-NEXT: | | | | Scalar -> Expr = '100_4' +!PTREE-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '100' +!PTREE-NEXT: | | Block +!PTREE-NEXT: | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> CallStmt = 'CALL func(i)' +!PTREE-NEXT: | | | | | | Call +!PTREE-NEXT: | | | | | ProcedureDesignator -> Name = 'func' +!PTREE-NEXT: | | | | | ActualArgSpec +!PTREE-NEXT: | | | | | | ActualArg -> Expr = 'i' +!PTREE-NEXT: | | | | | | | Designator -> DataRef -> Name = 'i' +!PTREE-NEXT: | | EndDoStmt -> +!PTREE-NEXT: | OmpEndLoopDirective +!PTREE-NEXT: | | OmpLoopDirective -> llvm::omp::Directive = unroll +!PTREE-NEXT: | | OmpClauseList -> diff --git a/flang/test/Parser/OpenMP/unroll.f90 b/flang/test/Parser/OpenMP/unroll-partial.f90 similarity index 100% rename from flang/test/Parser/OpenMP/unroll.f90 rename to flang/test/Parser/OpenMP/unroll-partial.f90 diff --git a/flang/test/Parser/acc-unparse.f90 b/flang/test/Parser/acc-unparse.f90 index 62e0d4487f3f7..12e6dec19f272 100644 --- a/flang/test/Parser/acc-unparse.f90 +++ b/flang/test/Parser/acc-unparse.f90 @@ -15,7 +15,7 @@ program bug47659 end do label1 end program -!CHECK-LABEL: PROGRAM bug47659 +!CHECK-LABEL: PROGRAM BUG47659 !CHECK: !$ACC PARALLEL LOOP diff --git a/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 b/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 index 29985a02eb6ef..cfe27e4f8fca1 100644 --- a/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 @@ -243,8 +243,15 @@ program openacc_kernels_loop_validity a(i) = 3.14 end do + !$acc kernels loop default(none) private(N, a) + do i = 1, N + a(i) = 3.14 + end do + !$acc kernels loop default(none) + !ERROR: The DEFAULT(NONE) clause requires that 'n' must be listed in a data-mapping clause do i = 1, N + !ERROR: The DEFAULT(NONE) clause requires that 'a' must be listed in a data-mapping clause a(i) = 3.14 end do diff --git a/flang/test/Semantics/OpenACC/acc-symbols01.f90 b/flang/test/Semantics/OpenACC/acc-symbols01.f90 index 375445bad13a5..51a7a3a23e8ce 100644 --- a/flang/test/Semantics/OpenACC/acc-symbols01.f90 +++ b/flang/test/Semantics/OpenACC/acc-symbols01.f90 @@ -1,24 +1,24 @@ ! 
RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenacc -!DEF: /mm MainProgram -program mm - !DEF: /mm/x ObjectEntity REAL(4) - !DEF: /mm/y ObjectEntity REAL(4) +!DEF: /MM MainProgram +program MM + !DEF: /MM/x ObjectEntity REAL(4) + !DEF: /MM/y ObjectEntity REAL(4) real x, y - !DEF: /mm/a ObjectEntity INTEGER(4) - !DEF: /mm/b ObjectEntity INTEGER(4) - !DEF: /mm/c ObjectEntity INTEGER(4) - !DEF: /mm/i ObjectEntity INTEGER(4) + !DEF: /MM/a ObjectEntity INTEGER(4) + !DEF: /MM/b ObjectEntity INTEGER(4) + !DEF: /MM/c ObjectEntity INTEGER(4) + !DEF: /MM/i ObjectEntity INTEGER(4) integer a(10), b(10), c(10), i - !REF: /mm/b + !REF: /MM/b b = 2 !$acc parallel present(c) firstprivate(b) private(a) !$acc loop - !REF: /mm/i + !REF: /MM/i do i=1,10 - !REF: /mm/a - !REF: /mm/i - !REF: /mm/b + !REF: /MM/a + !REF: /MM/i + !REF: /MM/b a(i) = b(i) end do !$acc end parallel diff --git a/flang/test/Semantics/OpenMP/combined-constructs.f90 b/flang/test/Semantics/OpenMP/combined-constructs.f90 index 4f2a4a4f501b9..2298d33ef33eb 100644 --- a/flang/test/Semantics/OpenMP/combined-constructs.f90 +++ b/flang/test/Semantics/OpenMP/combined-constructs.f90 @@ -207,7 +207,7 @@ program main enddo !$omp end target teams - !ERROR: Only the TO, FROM, TOFROM, ALLOC map types are permitted for MAP clauses on the TARGET TEAMS directive + !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET TEAMS directive !$omp target teams map(delete:a) do i = 1, N a(i) = 3.14 @@ -307,7 +307,7 @@ program main enddo !$omp end target teams distribute - !ERROR: Only the TO, FROM, TOFROM, ALLOC map types are permitted for MAP clauses on the TARGET TEAMS DISTRIBUTE directive + !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET TEAMS DISTRIBUTE directive !$omp target teams distribute map(delete:a) do i = 1, N a(i) = 3.14 @@ -400,7 +400,7 @@ program main enddo !$omp end target teams distribute parallel do - !ERROR: Only the TO, FROM, TOFROM, ALLOC map types are permitted for MAP clauses on the TARGET TEAMS DISTRIBUTE PARALLEL DO directive + !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET TEAMS DISTRIBUTE PARALLEL DO directive !$omp target teams distribute parallel do map(delete:a) do i = 1, N a(i) = 3.14 @@ -500,7 +500,7 @@ program main enddo !$omp end target teams distribute parallel do simd - !ERROR: Only the TO, FROM, TOFROM, ALLOC map types are permitted for MAP clauses on the TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD directive + !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD directive !$omp target teams distribute parallel do simd map(delete:a) do i = 1, N a(i) = 3.14 diff --git a/flang/test/Semantics/OpenMP/critical_within_default.f90 b/flang/test/Semantics/OpenMP/critical_within_default.f90 index dd972e6e52949..a5fe30eeb7de0 100644 --- a/flang/test/Semantics/OpenMP/critical_within_default.f90 +++ b/flang/test/Semantics/OpenMP/critical_within_default.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s ! 
Test that we do not make a private copy of the critical name -!CHECK: MainProgram scope: mn +!CHECK: MainProgram scope: MN !CHECK-NEXT: j size=4 offset=0: ObjectEntity type: INTEGER(4) !CHECK-NEXT: OtherConstruct scope: !CHECK-NEXT: j (OmpPrivate): HostAssoc diff --git a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 index 06f41ab8ce76f..e57a5c0c1cea6 100644 --- a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 +++ b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s program main -!CHECK-LABEL: MainProgram scope: main +!CHECK-LABEL: MainProgram scope: MAIN implicit none type ty diff --git a/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 b/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 index 9d0a097fb1991..fc977f2f1b839 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 @@ -17,7 +17,7 @@ end function mymax end module mymod program omp_examples -!CHECK-LABEL: MainProgram scope: omp_examples +!CHECK-LABEL: MainProgram scope: OMP_EXAMPLES use mymod implicit none integer, parameter :: n = 100 diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 index d7a9f2fc0a36b..84dbe1af01877 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 @@ -49,7 +49,7 @@ function my_add(x, y) end module m1 program test_vector -!CHECK-LABEL: MainProgram scope: test_vector +!CHECK-LABEL: MainProgram scope: TEST_VECTOR use vector_mod !CHECK: add_vectors (Function): Use from add_vectors in vector_mod implicit none diff --git a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 index 12e80cbf7b327..9cd638d796091 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 @@ -22,7 +22,7 @@ end function my_mul end module module1 program test_omp_reduction -!CHECK: MainProgram scope: test_omp_reduction +!CHECK: MainProgram scope: TEST_OMP_REDUCTION use module1, only: t1, operator(.modmul.) => operator(.mul.) !CHECK: .modmul. (Function): Use from .mul. 
in module1 diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90 index ddca38fd57812..1f39c57c54ad1 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction.f90 @@ -31,7 +31,7 @@ end subroutine initme end function func program main -!CHECK-LABEL: MainProgram scope: main +!CHECK-LABEL: MainProgram scope: MAIN !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) diff --git a/flang/test/Semantics/OpenMP/declare-target03.f90 b/flang/test/Semantics/OpenMP/declare-target03.f90 index 64a299d78224a..48cfc68393873 100644 --- a/flang/test/Semantics/OpenMP/declare-target03.f90 +++ b/flang/test/Semantics/OpenMP/declare-target03.f90 @@ -13,10 +13,10 @@ subroutine bar program main use mod1 - !ERROR: The module name or main program name cannot be in a DECLARE TARGET directive + !ERROR: The module name cannot be in a DECLARE TARGET directive !$omp declare target (mod1) - !PORTABILITY: Name 'main' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] - !ERROR: The module name or main program name cannot be in a DECLARE TARGET directive + ! This is now allowed: "main" is implicitly declared symbol separate + ! from the main program symbol !$omp declare target (main) end diff --git a/flang/test/Semantics/OpenMP/device-constructs.f90 b/flang/test/Semantics/OpenMP/device-constructs.f90 index 6f545b9021966..431e0f88e3237 100644 --- a/flang/test/Semantics/OpenMP/device-constructs.f90 +++ b/flang/test/Semantics/OpenMP/device-constructs.f90 @@ -123,7 +123,7 @@ program main enddo !$omp end target - !ERROR: Only the TO, FROM, TOFROM, ALLOC map types are permitted for MAP clauses on the TARGET directive + !ERROR: Only the ALLOC, FROM, TO, TOFROM map types are permitted for MAP clauses on the TARGET directive !$omp target map(delete:a) do i = 1, N a = 3.14 @@ -160,7 +160,7 @@ program main !ERROR: At most one IF clause can appear on the TARGET ENTER DATA directive !$omp target enter data map(to:a) if(.true.) if(.false.) - !ERROR: Only the TO, ALLOC map types are permitted for MAP clauses on the TARGET ENTER DATA directive + !ERROR: Only the ALLOC, TO, TOFROM map types are permitted for MAP clauses on the TARGET ENTER DATA directive !$omp target enter data map(from:a) !$omp target exit data map(delete:a) @@ -168,7 +168,7 @@ program main !ERROR: At most one DEVICE clause can appear on the TARGET EXIT DATA directive !$omp target exit data map(from:a) device(0) device(1) - !ERROR: Only the FROM, RELEASE, DELETE map types are permitted for MAP clauses on the TARGET EXIT DATA directive + !ERROR: Only the DELETE, FROM, RELEASE, TOFROM map types are permitted for MAP clauses on the TARGET EXIT DATA directive !$omp target exit data map(to:a) !$omp target update if(.true.) device(1) to(a) from(b) depend(inout:c) nowait diff --git a/flang/test/Semantics/OpenMP/do-schedule03.f90 b/flang/test/Semantics/OpenMP/do-schedule03.f90 index 8787b094d581a..05602ca57e4a9 100644 --- a/flang/test/Semantics/OpenMP/do-schedule03.f90 +++ b/flang/test/Semantics/OpenMP/do-schedule03.f90 @@ -2,27 +2,27 @@ ! OpenMP Version 4.5 ! 2.7.1 Schedule Clause ! Test that does not catch non constant integer expressions like xx - xx. 
- !DEF: /ompdoschedule MainProgram -program ompdoschedule - !DEF: /ompdoschedule/a ObjectEntity REAL(4) - !DEF: /ompdoschedule/y ObjectEntity REAL(4) - !DEF: /ompdoschedule/z ObjectEntity REAL(4) + !DEF: /OMPDOSCHEDULE MainProgram +program OMPDOSCHEDULE + !DEF: /OMPDOSCHEDULE/a ObjectEntity REAL(4) + !DEF: /OMPDOSCHEDULE/y ObjectEntity REAL(4) + !DEF: /OMPDOSCHEDULE/z ObjectEntity REAL(4) real a(100),y(100),z(100) - !DEF: /ompdoschedule/b ObjectEntity INTEGER(4) - !DEF: /ompdoschedule/i ObjectEntity INTEGER(4) - !DEF: /ompdoschedule/n ObjectEntity INTEGER(4) + !DEF: /OMPDOSCHEDULE/b ObjectEntity INTEGER(4) + !DEF: /OMPDOSCHEDULE/i ObjectEntity INTEGER(4) + !DEF: /OMPDOSCHEDULE/n ObjectEntity INTEGER(4) integer b,i,n - !REF: /ompdoschedule/b + !REF: /OMPDOSCHEDULE/b b = 10 !$omp do schedule(static,b-b) - !DEF: /ompdoschedule/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) - !REF: /ompdoschedule/n + !DEF: /OMPDOSCHEDULE/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !REF: /OMPDOSCHEDULE/n do i = 2,n+1 - !REF: /ompdoschedule/y - !REF: /ompdoschedule/OtherConstruct1/i - !REF: /ompdoschedule/z - !REF: /ompdoschedule/a + !REF: /OMPDOSCHEDULE/y + !REF: /OMPDOSCHEDULE/OtherConstruct1/i + !REF: /OMPDOSCHEDULE/z + !REF: /OMPDOSCHEDULE/a y(i) = z(i-1) + a(i) end do !$omp end do -end program ompdoschedule +end program OMPDOSCHEDULE diff --git a/flang/test/Semantics/OpenMP/do01-positivecase.f90 b/flang/test/Semantics/OpenMP/do01-positivecase.f90 index 905fdbaf18476..50a6870f43896 100644 --- a/flang/test/Semantics/OpenMP/do01-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do01-positivecase.f90 @@ -4,16 +4,16 @@ ! The loop iteration variable may not appear in a firstprivate directive. ! A positive case -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) integer i !$omp do firstprivate(k) - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 print *, "Hello" end do !$omp end do -end program omp_do +end program OMP_DO diff --git a/flang/test/Semantics/OpenMP/do04-positivecase.f90 b/flang/test/Semantics/OpenMP/do04-positivecase.f90 index eb2d67bb8ceb2..51b69fce3c7cc 100644 --- a/flang/test/Semantics/OpenMP/do04-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do04-positivecase.f90 @@ -2,21 +2,21 @@ ! OpenMP Version 4.5 ! 
2.7.1 Do Loop Constructs -!DEF: /omp_do1 MainProgram -program omp_do1 - !DEF: /omp_do1/i ObjectEntity INTEGER(4) - !DEF: /omp_do1/j ObjectEntity INTEGER(4) - !DEF: /omp_do1/k (OmpThreadprivate) ObjectEntity INTEGER(4) - !DEF: /omp_do1/n (OmpThreadprivate) ObjectEntity INTEGER(4) +!DEF: /OMP_DO1 MainProgram +program OMP_DO1 + !DEF: /OMP_DO1/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO1/j ObjectEntity INTEGER(4) + !DEF: /OMP_DO1/k (OmpThreadprivate) ObjectEntity INTEGER(4) + !DEF: /OMP_DO1/n (OmpThreadprivate) ObjectEntity INTEGER(4) integer i, j, k, n !$omp threadprivate (k,n) !$omp do - !DEF: /omp_do1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !REF: /omp_do1/j + !REF: /OMP_DO1/j do j=1,10 print *, "Hello" end do end do !$omp end do -end program omp_do1 +end program OMP_DO1 diff --git a/flang/test/Semantics/OpenMP/do05-positivecase.f90 b/flang/test/Semantics/OpenMP/do05-positivecase.f90 index eda04610535c2..d4eb1fd6bc3da 100644 --- a/flang/test/Semantics/OpenMP/do05-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do05-positivecase.f90 @@ -3,13 +3,13 @@ ! 2.7.1 Loop Construct restrictions on single directive. ! A positive case -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) - !DEF: /omp_do/n ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO/n ObjectEntity INTEGER(4) integer i,n !$omp parallel - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 !$omp single print *, "hello" @@ -19,13 +19,13 @@ program omp_do !$omp parallel default(shared) !$omp do - !DEF: /omp_do/OtherConstruct2/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) - !DEF: /omp_do/OtherConstruct2/OtherConstruct1/n HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct2/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct2/OtherConstruct1/n HostAssoc INTEGER(4) do i=1,n !$omp parallel !$omp single !DEF: /work EXTERNAL (Subroutine) ProcEntity - !DEF: /omp_do/OtherConstruct2/OtherConstruct1/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct2/OtherConstruct1/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4) call work(i, 1) !$omp end single !$omp end parallel @@ -34,7 +34,7 @@ program omp_do !$omp end parallel !$omp parallel private(i) - !DEF: /omp_do/OtherConstruct3/i (OmpPrivate, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct3/i (OmpPrivate, OmpExplicit) HostAssoc INTEGER(4) do i=1,10 !$omp single print *, "hello" @@ -43,32 +43,32 @@ program omp_do !$omp end parallel !$omp target teams distribute parallel do - !DEF:/omp_do/OtherConstruct4/i (OmpPrivate ,OmpPreDetermined) HostAssoc INTEGER(4) + !DEF:/OMP_DO/OtherConstruct4/i (OmpPrivate ,OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF:/omp_do/OtherConstruct4/i + !REF:/OMP_DO/OtherConstruct4/i if(i<10) cycle end do !$omp end target teams distribute parallel do !$omp target teams distribute parallel do simd - !DEF:/omp_do/OtherConstruct5/i (OmpLinear,OmpPreDetermined) HostAssoc INTEGER(4) + !DEF:/OMP_DO/OtherConstruct5/i (OmpLinear,OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF:/omp_do/OtherConstruct5/i + !REF:/OMP_DO/OtherConstruct5/i if(i<10) cycle end do !$omp end target teams 
distribute parallel do simd !$omp target teams distribute - !DEF: /omp_do/OtherConstruct6/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct6/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF: /omp_do/OtherConstruct6/i + !REF: /OMP_DO/OtherConstruct6/i if(i < 5) cycle end do !$omp target teams distribute simd - !DEF: /omp_do/OtherConstruct7/i (OmpLinear, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct7/i (OmpLinear, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF: /omp_do/OtherConstruct7/i + !REF: /OMP_DO/OtherConstruct7/i if(i < 5) cycle end do -end program omp_do +end program OMP_DO diff --git a/flang/test/Semantics/OpenMP/do06-positivecases.f90 b/flang/test/Semantics/OpenMP/do06-positivecases.f90 index 2713b55fa2ecb..dfb1d999bbc53 100644 --- a/flang/test/Semantics/OpenMP/do06-positivecases.f90 +++ b/flang/test/Semantics/OpenMP/do06-positivecases.f90 @@ -5,14 +5,14 @@ ! region ever binds to a loop region arising from the loop construct. ! A positive case -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) - !DEF: /omp_do/j ObjectEntity INTEGER(4) - !DEF: /omp_do/k ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO/j ObjectEntity INTEGER(4) + !DEF: /OMP_DO/k ObjectEntity INTEGER(4) integer i, j, k !$omp do ordered - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 !$omp ordered !DEF: /my_func EXTERNAL (Subroutine) ProcEntity @@ -20,4 +20,4 @@ program omp_do !$omp end ordered end do !$omp end do -end program omp_do +end program OMP_DO diff --git a/flang/test/Semantics/OpenMP/do11.f90 b/flang/test/Semantics/OpenMP/do11.f90 index faab457efff3c..472048d684276 100644 --- a/flang/test/Semantics/OpenMP/do11.f90 +++ b/flang/test/Semantics/OpenMP/do11.f90 @@ -2,24 +2,24 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop Constructs -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) - !DEF: /omp_do/j ObjectEntity INTEGER(4) - !DEF: /omp_do/k ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO/j ObjectEntity INTEGER(4) + !DEF: /OMP_DO/k ObjectEntity INTEGER(4) integer i, j, k !$omp do - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !REF: /omp_do/j + !REF: /OMP_DO/j do j=1,10 - !REF: /omp_do/OtherConstruct1/i - !REF: /omp_do/j + !REF: /OMP_DO/OtherConstruct1/i + !REF: /OMP_DO/j print *, "it", i, j end do end do !$omp end do -end program omp_do +end program OMP_DO !DEF: /omp_do2 (Subroutine)Subprogram subroutine omp_do2 diff --git a/flang/test/Semantics/OpenMP/do12.f90 b/flang/test/Semantics/OpenMP/do12.f90 index a057a246f7a99..06055b7572a60 100644 --- a/flang/test/Semantics/OpenMP/do12.f90 +++ b/flang/test/Semantics/OpenMP/do12.f90 @@ -2,20 +2,20 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop constructs. 
-!DEF: /omp_cycle MainProgram -program omp_cycle +!DEF: /OMP_CYCLE MainProgram +program OMP_CYCLE !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !REF: /omp_cycle/OtherConstruct1/i + !REF: /OMP_CYCLE/OtherConstruct1/i if (i<1) cycle - !DEF: /omp_cycle/j (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/j (Implicit) ObjectEntity INTEGER(4) do j=0,10 - !DEF: /omp_cycle/k (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/k (Implicit) ObjectEntity INTEGER(4) do k=0,10 - !REF: /omp_cycle/OtherConstruct1/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct1/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -23,17 +23,17 @@ program omp_cycle !$omp end do !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !REF: /omp_cycle/j + !REF: /OMP_CYCLE/j do j=0,10 - !REF: /omp_cycle/OtherConstruct2/i + !REF: /OMP_CYCLE/OtherConstruct2/i if (i<1) cycle - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 - !REF: /omp_cycle/OtherConstruct2/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct2/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -41,17 +41,17 @@ program omp_cycle !$omp end do !$omp do collapse(2) - !DEF: /omp_cycle/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 - !REF: /omp_cycle/OtherConstruct3/i + !REF: /OMP_CYCLE/OtherConstruct3/i if (i<1) cycle - !REF: /omp_cycle/OtherConstruct3/i - !REF: /omp_cycle/OtherConstruct3/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct3/i + !REF: /OMP_CYCLE/OtherConstruct3/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -59,17 +59,17 @@ program omp_cycle !$omp end do !$omp do collapse(3) - !DEF: /omp_cycle/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !DEF: /omp_cycle/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do k=0,10 - !REF: /omp_cycle/OtherConstruct4/i + !REF: /OMP_CYCLE/OtherConstruct4/i if (i<1) cycle - !REF: /omp_cycle/OtherConstruct4/i - !REF: /omp_cycle/OtherConstruct4/j - !REF: /omp_cycle/OtherConstruct4/k + !REF: /OMP_CYCLE/OtherConstruct4/i + !REF: /OMP_CYCLE/OtherConstruct4/j + !REF: /OMP_CYCLE/OtherConstruct4/k print *, i, j, k end do end do @@ -77,20 +77,20 @@ program omp_cycle !$omp end do !$omp do collapse(3) - !DEF: /omp_cycle/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo:do i=0,10 - !DEF: 
/omp_cycle/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1:do j=0,10 - !DEF: /omp_cycle/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo2:do k=0,10 - !REF: /omp_cycle/OtherConstruct5/i + !REF: /OMP_CYCLE/OtherConstruct5/i if (i<1) cycle foo2 - !REF: /omp_cycle/OtherConstruct5/i - !REF: /omp_cycle/OtherConstruct5/j - !REF: /omp_cycle/OtherConstruct5/k + !REF: /OMP_CYCLE/OtherConstruct5/i + !REF: /OMP_CYCLE/OtherConstruct5/j + !REF: /OMP_CYCLE/OtherConstruct5/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp end do -end program omp_cycle +end program OMP_CYCLE diff --git a/flang/test/Semantics/OpenMP/do14.f90 b/flang/test/Semantics/OpenMP/do14.f90 index 5e8a5a64c2979..e17647394fff7 100644 --- a/flang/test/Semantics/OpenMP/do14.f90 +++ b/flang/test/Semantics/OpenMP/do14.f90 @@ -2,19 +2,19 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop constructs. -!DEF: /omp_cycle MainProgram -program omp_cycle +!DEF: /OMP_CYCLE MainProgram +program OMP_CYCLE !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 cycle - !DEF: /omp_cycle/j (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/j (Implicit) ObjectEntity INTEGER(4) do j=0,10 - !DEF: /omp_cycle/k (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/k (Implicit) ObjectEntity INTEGER(4) do k=0,10 - !REF: /omp_cycle/OtherConstruct1/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct1/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -22,16 +22,16 @@ program omp_cycle !$omp end do !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !REF: /omp_cycle/j + !REF: /OMP_CYCLE/j do j=0,10 cycle - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 - !REF: /omp_cycle/OtherConstruct2/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct2/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -39,16 +39,16 @@ program omp_cycle !$omp end do !$omp do collapse(2) - !DEF: /omp_cycle/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 cycle - !REF: /omp_cycle/OtherConstruct3/i - !REF: /omp_cycle/OtherConstruct3/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct3/i + !REF: /OMP_CYCLE/OtherConstruct3/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -56,16 +56,16 @@ program omp_cycle !$omp end do !$omp do collapse(3) - !DEF: /omp_cycle/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc 
INTEGER(4) do j=0,10 - !DEF: /omp_cycle/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do k=0,10 cycle - !REF: /omp_cycle/OtherConstruct4/i - !REF: /omp_cycle/OtherConstruct4/j - !REF: /omp_cycle/OtherConstruct4/k + !REF: /OMP_CYCLE/OtherConstruct4/i + !REF: /OMP_CYCLE/OtherConstruct4/j + !REF: /OMP_CYCLE/OtherConstruct4/k print *, i, j, k end do end do @@ -73,19 +73,19 @@ program omp_cycle !$omp end do !$omp do ordered(3) - !DEF: /omp_cycle/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo:do i=0,10 - !DEF: /omp_cycle/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1:do j=0,10 - !DEF: /omp_cycle/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo2:do k=0,10 cycle foo2 - !REF: /omp_cycle/OtherConstruct5/i - !REF: /omp_cycle/OtherConstruct5/j - !REF: /omp_cycle/OtherConstruct5/k + !REF: /OMP_CYCLE/OtherConstruct5/i + !REF: /OMP_CYCLE/OtherConstruct5/j + !REF: /OMP_CYCLE/OtherConstruct5/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp end do -end program omp_cycle +end program OMP_CYCLE diff --git a/flang/test/Semantics/OpenMP/do17.f90 b/flang/test/Semantics/OpenMP/do17.f90 index c0c59f16dee1b..cac11f215f074 100644 --- a/flang/test/Semantics/OpenMP/do17.f90 +++ b/flang/test/Semantics/OpenMP/do17.f90 @@ -2,56 +2,56 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop constructs. -!DEF: /test MainProgram -program test - !DEF: /test/i ObjectEntity INTEGER(4) - !DEF: /test/j ObjectEntity INTEGER(4) - !DEF: /test/k ObjectEntity INTEGER(4) +!DEF: /TEST MainProgram +program TEST + !DEF: /TEST/i ObjectEntity INTEGER(4) + !DEF: /TEST/j ObjectEntity INTEGER(4) + !DEF: /TEST/k ObjectEntity INTEGER(4) integer i, j, k !$omp do collapse(2) - !DEF: /test/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo: do i=0,10 - !DEF: /test/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1: do j=0,10 - !REF: /test/k + !REF: /TEST/k foo2: do k=0,10 - !REF: /test/OtherConstruct1/i + !REF: /TEST/OtherConstruct1/i select case (i) case (5) cycle foo1 case (7) cycle foo2 end select - !REF: /test/OtherConstruct1/i - !REF: /test/OtherConstruct1/j - !REF: /test/k + !REF: /TEST/OtherConstruct1/i + !REF: /TEST/OtherConstruct1/j + !REF: /TEST/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp do collapse(2) - !DEF: /test/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo: do i=0,10 - !DEF: /test/OtherConstruct2/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct2/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1: do j=0,10 - !REF: /test/k + !REF: /TEST/k foo2: do k=0,10 - !REF: /test/OtherConstruct2/i + !REF: /TEST/OtherConstruct2/i if (i<3) then cycle foo1 - !REF: /test/OtherConstruct2/i + !REF: /TEST/OtherConstruct2/i else if (i>8) then cycle foo1 else cycle foo2 end if - !REF: /test/OtherConstruct2/i - !REF: 
/test/OtherConstruct2/j - !REF: /test/k + !REF: /TEST/OtherConstruct2/i + !REF: /TEST/OtherConstruct2/j + !REF: /TEST/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp end do -end program test +end program TEST diff --git a/flang/test/Semantics/OpenMP/map-clause-symbols.f90 b/flang/test/Semantics/OpenMP/map-clause-symbols.f90 index 8f984fcd2fa7e..1d6315b4a2312 100644 --- a/flang/test/Semantics/OpenMP/map-clause-symbols.f90 +++ b/flang/test/Semantics/OpenMP/map-clause-symbols.f90 @@ -1,6 +1,6 @@ ! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s program main -!CHECK-LABEL: MainProgram scope: main +!CHECK-LABEL: MainProgram scope: MAIN integer, parameter :: n = 256 real(8) :: a(256) !$omp target map(mapper(xx), from:a) diff --git a/flang/test/Semantics/OpenMP/map-modifiers-v60.f90 b/flang/test/Semantics/OpenMP/map-modifiers-v60.f90 new file mode 100644 index 0000000000000..b3f2e5171d6d6 --- /dev/null +++ b/flang/test/Semantics/OpenMP/map-modifiers-v60.f90 @@ -0,0 +1,35 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 -Werror + +subroutine f00(x) + integer :: x +!WARNING: 'self-modifier' modifier is not supported in OpenMP v5.2, try -fopenmp-version=60 + !$omp target map(self: x) + x = x + 1 + !$omp end target +end + +subroutine f01(x) + integer, pointer :: x +!WARNING: 'ref-modifier' modifier is not supported in OpenMP v5.2, try -fopenmp-version=60 + !$omp target map(ref_ptr: x) + x = x + 1 + !$omp end target +end + +subroutine f02(x) + integer, pointer :: x +!WARNING: 'ref-modifier' modifier is not supported in OpenMP v5.2, try -fopenmp-version=60 + !$omp target map(ref_ptee: x) + x = x + 1 + !$omp end target +end + +subroutine f03(x) + integer, pointer :: x +!WARNING: 'ref-modifier' modifier is not supported in OpenMP v5.2, try -fopenmp-version=60 + !$omp target map(ref_ptr_ptee: x) + x = x + 1 + !$omp end target +end + + diff --git a/flang/test/Semantics/OpenMP/reduction08.f90 b/flang/test/Semantics/OpenMP/reduction08.f90 index 01a06eb7d7414..b4a81e644c1e7 100644 --- a/flang/test/Semantics/OpenMP/reduction08.f90 +++ b/flang/test/Semantics/OpenMP/reduction08.f90 @@ -2,62 +2,62 @@ ! OpenMP Version 4.5 ! 
2.15.3.6 Reduction Clause Positive cases -!DEF: /omp_reduction MainProgram -program omp_reduction - !DEF: /omp_reduction/i ObjectEntity INTEGER(4) +!DEF: /OMP_REDUCTION MainProgram +program OMP_REDUCTION + !DEF: /OMP_REDUCTION/i ObjectEntity INTEGER(4) integer i - !DEF: /omp_reduction/k ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/k ObjectEntity INTEGER(4) integer :: k = 10 - !DEF: /omp_reduction/m ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/m ObjectEntity INTEGER(4) integer :: m = 12 !$omp parallel do reduction(max:k) - !DEF: /omp_reduction/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/max ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct1/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/max ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct1/m (OmpShared) HostAssoc INTEGER(4) k = max(k, m) end do !$omp end parallel do !$omp parallel do reduction(min:k) - !DEF: /omp_reduction/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct2/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct2/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct2/m (OmpShared) HostAssoc INTEGER(4) k = min(k, m) end do !$omp end parallel do !$omp parallel do reduction(iand:k) - !DEF: /omp_reduction/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct3/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/iand ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct3/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/iand ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct3/m (OmpShared) HostAssoc INTEGER(4) k = iand(k, m) end do !$omp end parallel do !$omp parallel do reduction(ior:k) - !DEF: /omp_reduction/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct4/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/ior ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct4/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/ior ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct4/m (OmpShared) HostAssoc INTEGER(4) k = ior(k, m) end do !$omp end parallel do !$omp parallel do reduction(ieor:k) - 
!DEF: /omp_reduction/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct5/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/ieor ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct5/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/ieor ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct5/m (OmpShared) HostAssoc INTEGER(4) k = ieor(k,m) end do !$omp end parallel do -end program omp_reduction +end program OMP_REDUCTION diff --git a/flang/test/Semantics/OpenMP/reduction09.f90 b/flang/test/Semantics/OpenMP/reduction09.f90 index d6c71c30d2834..ca60805e8c416 100644 --- a/flang/test/Semantics/OpenMP/reduction09.f90 +++ b/flang/test/Semantics/OpenMP/reduction09.f90 @@ -1,22 +1,22 @@ ! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp ! OpenMP Version 4.5 ! 2.15.3.6 Reduction Clause Positive cases. -!DEF: /omp_reduction MainProgram -program omp_reduction - !DEF: /omp_reduction/i ObjectEntity INTEGER(4) +!DEF: /OMP_REDUCTION MainProgram +program OMP_REDUCTION + !DEF: /OMP_REDUCTION/i ObjectEntity INTEGER(4) integer i - !DEF: /omp_reduction/k ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/k ObjectEntity INTEGER(4) integer :: k = 10 - !DEF: /omp_reduction/a ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/a ObjectEntity INTEGER(4) integer a(10) - !DEF: /omp_reduction/b ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/b ObjectEntity INTEGER(4) integer b(10,10,10) !$omp parallel shared(k) !$omp do reduction(+:k) - !DEF: /omp_reduction/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct1/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) k = k+1 end do !$omp end do @@ -24,53 +24,53 @@ program omp_reduction !$omp parallel do reduction(+:a(10)) - !DEF: /omp_reduction/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct2/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel do reduction(+:a(1:10:1)) - !DEF: /omp_reduction/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct3/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel do reduction(+:b(1:10:1,1:5,2)) - !DEF: /omp_reduction/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct4/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel 
do !$omp parallel do reduction(+:b(1:10:1,1:5,2:5:1)) - !DEF: /omp_reduction/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct5/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel private(i) !$omp do reduction(+:k) reduction(+:j) - !DEF: /omp_reduction/OtherConstruct6/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct6/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct6/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct6/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) k = k+1 end do !$omp end do !$omp end parallel !$omp do reduction(+:k) reduction(*:j) reduction(+:l) - !DEF: /omp_reduction/OtherConstruct7/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct7/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct7/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct7/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) k = k+1 end do !$omp end do -end program omp_reduction +end program OMP_REDUCTION diff --git a/flang/test/Semantics/OpenMP/reduction11.f90 b/flang/test/Semantics/OpenMP/reduction11.f90 index b2ad0f6a6ee11..dfb3986d37d78 100644 --- a/flang/test/Semantics/OpenMP/reduction11.f90 +++ b/flang/test/Semantics/OpenMP/reduction11.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols -o - %s 2>&1 | FileCheck %s ! Check intrinsic reduction symbols (in this case "max" are marked as INTRINSIC -! CHECK: MainProgram scope: omp_reduction +! CHECK: MainProgram scope: OMP_REDUCTION program omp_reduction ! CHECK: i size=4 offset=0: ObjectEntity type: INTEGER(4) integer i diff --git a/flang/test/Semantics/OpenMP/scan2.f90 b/flang/test/Semantics/OpenMP/scan2.f90 index ffe84910f88a2..1ae5e871595c4 100644 --- a/flang/test/Semantics/OpenMP/scan2.f90 +++ b/flang/test/Semantics/OpenMP/scan2.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols -o - %s 2>&1 | FileCheck %s ! Check scan reduction -! CHECK: MainProgram scope: omp_reduction +! CHECK: MainProgram scope: OMP_REDUCTION program omp_reduction ! 
CHECK: i size=4 offset=0: ObjectEntity type: INTEGER(4) integer i diff --git a/flang/test/Semantics/OpenMP/symbol01.f90 b/flang/test/Semantics/OpenMP/symbol01.f90 index fbd9a0286c79b..74fb420cc517e 100644 --- a/flang/test/Semantics/OpenMP/symbol01.f90 +++ b/flang/test/Semantics/OpenMP/symbol01.f90 @@ -16,53 +16,53 @@ module md integer :: b end type myty end module md -!DEF: /mm MainProgram -program mm +!DEF: /MM MainProgram +program MM !REF: /md use :: md - !DEF: /mm/c CommonBlockDetails - !DEF: /mm/x (InCommonBlock) ObjectEntity REAL(4) - !DEF: /mm/y (InCommonBlock) ObjectEntity REAL(4) + !DEF: /MM/c CommonBlockDetails + !DEF: /MM/x (InCommonBlock) ObjectEntity REAL(4) + !DEF: /MM/y (InCommonBlock) ObjectEntity REAL(4) common /c/x, y - !REF: /mm/x - !REF: /mm/y + !REF: /MM/x + !REF: /MM/y real x, y - !DEF: /mm/myty Use - !DEF: /mm/t ObjectEntity TYPE(myty) + !DEF: /MM/myty Use + !DEF: /MM/t ObjectEntity TYPE(myty) type(myty) :: t - !DEF: /mm/b ObjectEntity INTEGER(4) + !DEF: /MM/b ObjectEntity INTEGER(4) integer b(10) - !REF: /mm/t + !REF: /MM/t !REF: /md/myty/a t%a = 3.14 - !REF: /mm/t + !REF: /MM/t !REF: /md/myty/b t%b = 1 - !REF: /mm/b + !REF: /MM/b b = 2 - !DEF: /mm/a (Implicit) ObjectEntity REAL(4) + !DEF: /MM/a (Implicit) ObjectEntity REAL(4) a = 1.0 - !DEF: /mm/c (Implicit) ObjectEntity REAL(4) + !DEF: /MM/c (Implicit) ObjectEntity REAL(4) c = 2.0 !$omp parallel do private(a,t,/c/) shared(c) - !DEF: /mm/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /MM/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /mm/OtherConstruct1/a (OmpPrivate, OmpExplicit) HostAssoc REAL(4) - !DEF: /mm/OtherConstruct1/b (OmpShared) HostAssoc INTEGER(4) - !REF: /mm/OtherConstruct1/i + !DEF: /MM/OtherConstruct1/a (OmpPrivate, OmpExplicit) HostAssoc REAL(4) + !DEF: /MM/OtherConstruct1/b (OmpShared) HostAssoc INTEGER(4) + !REF: /MM/OtherConstruct1/i a = a+b(i) - !DEF: /mm/OtherConstruct1/t (OmpPrivate, OmpExplicit) HostAssoc TYPE(myty) + !DEF: /MM/OtherConstruct1/t (OmpPrivate, OmpExplicit) HostAssoc TYPE(myty) !REF: /md/myty/a - !REF: /mm/OtherConstruct1/i + !REF: /MM/OtherConstruct1/i t%a = i - !DEF: /mm/OtherConstruct1/y (OmpPrivate, OmpExplicit) HostAssoc REAL(4) + !DEF: /MM/OtherConstruct1/y (OmpPrivate, OmpExplicit) HostAssoc REAL(4) y = 0. 
- !DEF: /mm/OtherConstruct1/x (OmpPrivate, OmpExplicit) HostAssoc REAL(4) - !REF: /mm/OtherConstruct1/a - !REF: /mm/OtherConstruct1/i - !REF: /mm/OtherConstruct1/y + !DEF: /MM/OtherConstruct1/x (OmpPrivate, OmpExplicit) HostAssoc REAL(4) + !REF: /MM/OtherConstruct1/a + !REF: /MM/OtherConstruct1/i + !REF: /MM/OtherConstruct1/y x = a+i+y - !DEF: /mm/OtherConstruct1/c (OmpShared, OmpExplicit) HostAssoc REAL(4) + !DEF: /MM/OtherConstruct1/c (OmpShared, OmpExplicit) HostAssoc REAL(4) c = 3.0 end do end program diff --git a/flang/test/Semantics/OpenMP/symbol05.f90 b/flang/test/Semantics/OpenMP/symbol05.f90 index fe01f15d20aa3..4f3d1926013dc 100644 --- a/flang/test/Semantics/OpenMP/symbol05.f90 +++ b/flang/test/Semantics/OpenMP/symbol05.f90 @@ -31,10 +31,10 @@ subroutine foo end block end subroutine foo end module mm -!DEF: /tt MainProgram -program tt +!DEF: /TT MainProgram +program TT !REF: /mm use :: mm - !DEF: /tt/foo (Subroutine) Use + !DEF: /TT/foo (Subroutine) Use call foo -end program tt +end program TT diff --git a/flang/test/Semantics/OpenMP/symbol07.f90 b/flang/test/Semantics/OpenMP/symbol07.f90 index 86b7305411347..1b0c25b7a04b0 100644 --- a/flang/test/Semantics/OpenMP/symbol07.f90 +++ b/flang/test/Semantics/OpenMP/symbol07.f90 @@ -30,8 +30,8 @@ subroutine function_call_in_region !REF: /function_call_in_region/b print *, a, b end subroutine function_call_in_region -!DEF: /mm MainProgram -program mm +!DEF: /MM MainProgram +program MM !REF: /function_call_in_region call function_call_in_region -end program mm +end program MM diff --git a/flang/test/Semantics/OpenMP/symbol09.f90 b/flang/test/Semantics/OpenMP/symbol09.f90 index 86b7305411347..1b0c25b7a04b0 100644 --- a/flang/test/Semantics/OpenMP/symbol09.f90 +++ b/flang/test/Semantics/OpenMP/symbol09.f90 @@ -30,8 +30,8 @@ subroutine function_call_in_region !REF: /function_call_in_region/b print *, a, b end subroutine function_call_in_region -!DEF: /mm MainProgram -program mm +!DEF: /MM MainProgram +program MM !REF: /function_call_in_region call function_call_in_region -end program mm +end program MM diff --git a/flang/test/Semantics/OpenMP/threadprivate03.f90 b/flang/test/Semantics/OpenMP/threadprivate03.f90 index 81e26ee327a9d..fda2fe608ac3c 100644 --- a/flang/test/Semantics/OpenMP/threadprivate03.f90 +++ b/flang/test/Semantics/OpenMP/threadprivate03.f90 @@ -10,11 +10,11 @@ program main use mod1 integer, parameter :: i = 1 - !ERROR: The module name or main program name cannot be in a THREADPRIVATE directive + !ERROR: The module name cannot be in a THREADPRIVATE directive !$omp threadprivate(mod1) - !PORTABILITY: Name 'main' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] - !ERROR: The module name or main program name cannot be in a THREADPRIVATE directive + ! This is now allowed, since "main" is implicitly declared symbol, + ! separate from the main program symbol. !$omp threadprivate(main) !ERROR: The entity with PARAMETER attribute cannot be in a THREADPRIVATE directive diff --git a/flang/test/Semantics/PowerPC/ppc-vector-types01.f90 b/flang/test/Semantics/PowerPC/ppc-vector-types01.f90 index ad69b69a47f76..ea54a00fa4392 100644 --- a/flang/test/Semantics/PowerPC/ppc-vector-types01.f90 +++ b/flang/test/Semantics/PowerPC/ppc-vector-types01.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fdebug-unparse %s | FileCheck %s ! REQUIRES: target=powerpc{{.*}} - ! CHECK-LABEL: PROGRAM ppc_vec_unit + ! CHECK-LABEL: PROGRAM PPC_VEC_UNIT program ppc_vec_unit implicit none ! 
CHECK: VECTOR(INTEGER(KIND=4_4)) :: vi1, vi2 diff --git a/flang/test/Semantics/PowerPC/ppc-vector-types02.f90 b/flang/test/Semantics/PowerPC/ppc-vector-types02.f90 index 8c96684c50eb7..175b58680a209 100644 --- a/flang/test/Semantics/PowerPC/ppc-vector-types02.f90 +++ b/flang/test/Semantics/PowerPC/ppc-vector-types02.f90 @@ -2,7 +2,7 @@ ! REQUIRES: target=powerpc{{.*}} ! C: MainProgram scope: ppc_vec_types -! CHECK-LABEL: MainProgram scope: ppc_vec_types size={{[0-9]*}} alignment={{[0-9]*}} +! CHECK-LABEL: MainProgram scope: PPC_VEC_TYPES size={{[0-9]*}} alignment={{[0-9]*}} program ppc_vec_types implicit none vector(integer(4)) :: vi diff --git a/flang/test/Semantics/bind-c18.f90 b/flang/test/Semantics/bind-c18.f90 new file mode 100644 index 0000000000000..f61111458c6d9 --- /dev/null +++ b/flang/test/Semantics/bind-c18.f90 @@ -0,0 +1,7 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +bind(c) :: /blk/ +!ERROR: 'x' may not be a member of BIND(C) COMMON block /blk/ +common /blk/ x +!BECAUSE: A scalar interoperable variable may not be ALLOCATABLE or POINTER +integer, pointer :: x +end diff --git a/flang/test/Semantics/bug148559.f90 b/flang/test/Semantics/bug148559.f90 new file mode 100644 index 0000000000000..d7b959ac8f191 --- /dev/null +++ b/flang/test/Semantics/bug148559.f90 @@ -0,0 +1,12 @@ +!RUN: %flang_fc1 -fsyntax-only %s +!Regression test for crash in semantics on Cray pointers + +module m + pointer(ptr,pp) +end module m + +program main + use m, only:renamea=>pp + use m, only:pp + print *, renamea +end diff --git a/flang/test/Semantics/bug148675.f90 b/flang/test/Semantics/bug148675.f90 new file mode 100644 index 0000000000000..5ce117e7bb3df --- /dev/null +++ b/flang/test/Semantics/bug148675.f90 @@ -0,0 +1,21 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +module m + type t + integer n + contains + procedure :: assign1 => myassign, assign2 => myassign + generic :: ASSIGNMENT(=) => assign1 + generic :: ASSIGNMENT(=) => assign2 + end type + contains + subroutine myassign(to, from) + class(t), intent(out) :: to + integer, intent(in) :: from + to%n = from + end + subroutine test + type(t) x + !ERROR: Multiple specific procedures for the generic ASSIGNMENT(=) match operand types TYPE(t) and INTEGER(4) + x = 5 + end +end diff --git a/flang/test/Semantics/getsymbols03-a.f90 b/flang/test/Semantics/getsymbols03-a.f90 index 95b7fb418367d..5c5e87575a9cb 100644 --- a/flang/test/Semantics/getsymbols03-a.f90 +++ b/flang/test/Semantics/getsymbols03-a.f90 @@ -8,7 +8,7 @@ program main end program ! RUN: %flang_fc1 -fget-symbols-sources %s 2>&1 | FileCheck %s +! CHECK:MAIN:{{.*}}getsymbols03-a.f90, 4, 9-13 ! CHECK:f:{{.*}}getsymbols03-b.f90, 2, 12-13 -! CHECK:main:{{.*}}getsymbols03-a.f90, 4, 9-13 ! CHECK:mm3:{{.*}}getsymbols03-a.f90, 5, 6-9 ! CHECK:x:{{.*}}getsymbols03-a.f90, 6, 13-14 diff --git a/flang/test/Semantics/long-name.f90 b/flang/test/Semantics/long-name.f90 index 44899b13edd5a..d5a795113e204 100644 --- a/flang/test/Semantics/long-name.f90 +++ b/flang/test/Semantics/long-name.f90 @@ -1,6 +1,6 @@ ! 
RUN: %python %S/test_errors.py %s %flang_fc1 -Werror -pedantic -!PORTABILITY: aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffggg1 has length 64, which is greater than the maximum name length 63 [-Wlong-names] +!PORTABILITY: AAAAAAAAAABBBBBBBBBBCCCCCCCCCCDDDDDDDDDDEEEEEEEEEEFFFFFFFFFFGGG1 has length 64, which is greater than the maximum name length 63 [-Wlong-names] program aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffggg1 !PORTABILITY: aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffggg2 has length 64, which is greater than the maximum name length 63 [-Wlong-names] diff --git a/flang/test/Semantics/modproc01.f90 b/flang/test/Semantics/modproc01.f90 index 5f45362e95093..e565ddcfbe0b1 100644 --- a/flang/test/Semantics/modproc01.f90 +++ b/flang/test/Semantics/modproc01.f90 @@ -125,7 +125,7 @@ program test x = mf(3, "abc", pdt1(1,3)()) ! call ms(mf) end program -!CHECK: MainProgram scope: test size=88 alignment=8 +!CHECK: MainProgram scope: TEST size=88 alignment=8 !CHECK: mf, MODULE (Function): Use from mf in m !CHECK: pdt1: Use from pdt1 in m !CHECK: pdt2: Use from pdt2 in m diff --git a/flang/test/Semantics/multi-programs04.f90 b/flang/test/Semantics/multi-programs04.f90 index 54b0235aa78f0..e69ac7325278e 100644 --- a/flang/test/Semantics/multi-programs04.f90 +++ b/flang/test/Semantics/multi-programs04.f90 @@ -4,6 +4,6 @@ program m end !ERROR: A source file cannot contain more than one main program -!ERROR: 'm' is already declared in this scoping unit +!ERROR: 'M' is already declared in this scoping unit program m end diff --git a/flang/test/Semantics/pointer01.f90 b/flang/test/Semantics/pointer01.f90 index eaa2426dd77e3..79d6016a6af46 100644 --- a/flang/test/Semantics/pointer01.f90 +++ b/flang/test/Semantics/pointer01.f90 @@ -7,7 +7,6 @@ subroutine msubr end module program main use m - !PORTABILITY: Name 'main' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] pointer main !ERROR: Cannot change POINTER attribute on use-associated 'mobj' pointer mobj diff --git a/flang/test/Semantics/procinterface01.f90 b/flang/test/Semantics/procinterface01.f90 index 73040b0987bd0..70f4a889d6809 100644 --- a/flang/test/Semantics/procinterface01.f90 +++ b/flang/test/Semantics/procinterface01.f90 @@ -159,35 +159,35 @@ end function logical tan = "?" end function tan -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN !REF: /module1 use :: module1 - !DEF: /main/derived1 Use - !DEF: /main/instance ObjectEntity TYPE(derived1) + !DEF: /MAIN/derived1 Use + !DEF: /MAIN/instance ObjectEntity TYPE(derived1) type(derived1) :: instance - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p1 if (instance%p1(1.)/=2.) print *, "p1 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p2 if (instance%p2(1.)/=2.) print *, "p2 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p3 if (.not.instance%p3(1.)) print *, "p3 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p4 if (.not.instance%p4(1.)) print *, "p4 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p5 if (instance%p5(1.)/=(5.,6.)) print *, "p5 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p6 if (instance%p6(1.)/=2.) print *, "p6 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p7 if (instance%p7(0.)/=1.) 
print *, "p7 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p8 if (instance%p8(1.)/="a") print *, "p8 failed" -end program main +end program MAIN diff --git a/flang/test/Semantics/resolve05.f90 b/flang/test/Semantics/resolve05.f90 index 0c9877af9b4e2..7b142d2ebd613 100644 --- a/flang/test/Semantics/resolve05.f90 +++ b/flang/test/Semantics/resolve05.f90 @@ -1,6 +1,5 @@ ! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic program p - !PORTABILITY: Name 'p' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] integer :: p end module m diff --git a/flang/test/Semantics/resolve125.f90 b/flang/test/Semantics/resolve125.f90 index e040c006ec179..620c7d65578cd 100644 --- a/flang/test/Semantics/resolve125.f90 +++ b/flang/test/Semantics/resolve125.f90 @@ -43,7 +43,7 @@ subroutine reset end subroutine reset end module m2 -!CHECK: MainProgram scope: main +!CHECK: MainProgram scope: MAIN !CHECK: i: Use from i in m2 !CHECK: i2: Use from i2 in m2 !CHECK: init (Subroutine): Use from init in m2 @@ -61,4 +61,4 @@ program main else print *, "fail" end if -end program main \ No newline at end of file +end program main diff --git a/flang/test/Semantics/resolve40.f90 b/flang/test/Semantics/resolve40.f90 index a91507aa62282..81bb5f989ec48 100644 --- a/flang/test/Semantics/resolve40.f90 +++ b/flang/test/Semantics/resolve40.f90 @@ -96,3 +96,10 @@ subroutine s12(x) !BECAUSE: 'x' is an INTENT(IN) dummy argument read(*,nml=nl) end + +subroutine s13() + implicit none + !ERROR: No explicit type declared for 'i' + !ERROR: No explicit type declared for 'i' + print *, (i, i = 1, 2) +end diff --git a/flang/test/Semantics/symbol03.f90 b/flang/test/Semantics/symbol03.f90 index a6b4b0bd15937..62472495d9736 100644 --- a/flang/test/Semantics/symbol03.f90 +++ b/flang/test/Semantics/symbol03.f90 @@ -1,23 +1,23 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 ! Test host association in internal subroutine of main program. -!DEF: /main MainProgram -program main - !DEF: /main/x ObjectEntity INTEGER(4) +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/x ObjectEntity INTEGER(4) integer x - !DEF: /main/s (Subroutine) Subprogram + !DEF: /MAIN/s (Subroutine) Subprogram call s contains - !REF: /main/s + !REF: /MAIN/s subroutine s - !DEF: /main/s/y (Implicit) ObjectEntity REAL(4) - !DEF: /main/s/x HostAssoc INTEGER(4) + !DEF: /MAIN/s/y (Implicit) ObjectEntity REAL(4) + !DEF: /MAIN/s/x HostAssoc INTEGER(4) y = x contains - !DEF: /main/s/s2 (Subroutine) Subprogram + !DEF: /MAIN/s/s2 (Subroutine) Subprogram subroutine s2 - !DEF: /main/s/s2/z (Implicit) ObjectEntity REAL(4) - !DEF: /main/s/s2/x HostAssoc INTEGER(4) + !DEF: /MAIN/s/s2/z (Implicit) ObjectEntity REAL(4) + !DEF: /MAIN/s/s2/x HostAssoc INTEGER(4) z = x end subroutine end subroutine diff --git a/flang/test/Semantics/symbol06.f90 b/flang/test/Semantics/symbol06.f90 index bbd6d4d071c89..b45edabcd5318 100644 --- a/flang/test/Semantics/symbol06.f90 +++ b/flang/test/Semantics/symbol06.f90 @@ -1,56 +1,56 @@ ! 
RUN: %python %S/test_symbols.py %s %flang_fc1 -!DEF: /main MainProgram -program main - !DEF: /main/t1 DerivedType +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/t1 DerivedType type :: t1 - !DEF: /main/t1/a1 ObjectEntity INTEGER(4) + !DEF: /MAIN/t1/a1 ObjectEntity INTEGER(4) integer :: a1 end type - !REF: /main/t1 - !DEF: /main/t2 DerivedType + !REF: /MAIN/t1 + !DEF: /MAIN/t2 DerivedType type, extends(t1) :: t2 - !DEF: /main/t2/a2 ObjectEntity INTEGER(4) + !DEF: /MAIN/t2/a2 ObjectEntity INTEGER(4) integer :: a2 end type - !REF: /main/t2 - !DEF: /main/t3 DerivedType + !REF: /MAIN/t2 + !DEF: /MAIN/t3 DerivedType type, extends(t2) :: t3 - !DEF: /main/t3/a3 ObjectEntity INTEGER(4) + !DEF: /MAIN/t3/a3 ObjectEntity INTEGER(4) integer :: a3 end type - !REF: /main/t3 - !DEF: /main/x3 ObjectEntity TYPE(t3) + !REF: /MAIN/t3 + !DEF: /MAIN/x3 ObjectEntity TYPE(t3) type(t3) :: x3 - !DEF: /main/i ObjectEntity INTEGER(4) + !DEF: /MAIN/i ObjectEntity INTEGER(4) integer i - !REF: /main/i - !REF: /main/x3 - !REF: /main/t2/a2 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t2/a2 i = x3%a2 - !REF: /main/i - !REF: /main/x3 - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t1/a1 i = x3%a1 - !REF: /main/i - !REF: /main/x3 - !DEF: /main/t3/t2 (ParentComp) ObjectEntity TYPE(t2) - !REF: /main/t2/a2 + !REF: /MAIN/i + !REF: /MAIN/x3 + !DEF: /MAIN/t3/t2 (ParentComp) ObjectEntity TYPE(t2) + !REF: /MAIN/t2/a2 i = x3%t2%a2 - !REF: /main/i - !REF: /main/x3 - !REF: /main/t3/t2 - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t3/t2 + !REF: /MAIN/t1/a1 i = x3%t2%a1 - !REF: /main/i - !REF: /main/x3 - !DEF: /main/t2/t1 (ParentComp) ObjectEntity TYPE(t1) - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !DEF: /MAIN/t2/t1 (ParentComp) ObjectEntity TYPE(t1) + !REF: /MAIN/t1/a1 i = x3%t1%a1 - !REF: /main/i - !REF: /main/x3 - !REF: /main/t3/t2 - !REF: /main/t2/t1 - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t3/t2 + !REF: /MAIN/t2/t1 + !REF: /MAIN/t1/a1 i = x3%t2%t1%a1 end program diff --git a/flang/test/Semantics/symbol07.f90 b/flang/test/Semantics/symbol07.f90 index f3cc934e51b16..e1d8257b9e190 100644 --- a/flang/test/Semantics/symbol07.f90 +++ b/flang/test/Semantics/symbol07.f90 @@ -1,40 +1,40 @@ ! 
RUN: %python %S/test_symbols.py %s %flang_fc1 -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN implicit complex(z) - !DEF: /main/t DerivedType + !DEF: /MAIN/t DerivedType type :: t - !DEF: /main/t/re ObjectEntity REAL(4) + !DEF: /MAIN/t/re ObjectEntity REAL(4) real :: re - !DEF: /main/t/im ObjectEntity REAL(4) + !DEF: /MAIN/t/im ObjectEntity REAL(4) real :: im end type - !DEF: /main/z1 ObjectEntity COMPLEX(4) + !DEF: /MAIN/z1 ObjectEntity COMPLEX(4) complex z1 - !REF: /main/t - !DEF: /main/w ObjectEntity TYPE(t) + !REF: /MAIN/t + !DEF: /MAIN/w ObjectEntity TYPE(t) type(t) :: w - !DEF: /main/x ObjectEntity REAL(4) - !DEF: /main/y ObjectEntity REAL(4) + !DEF: /MAIN/x ObjectEntity REAL(4) + !DEF: /MAIN/y ObjectEntity REAL(4) real x, y - !REF: /main/x - !REF: /main/z1 + !REF: /MAIN/x + !REF: /MAIN/z1 x = z1%re - !REF: /main/y - !REF: /main/z1 + !REF: /MAIN/y + !REF: /MAIN/z1 y = z1%im - !DEF: /main/z2 (Implicit) ObjectEntity COMPLEX(4) - !REF: /main/x + !DEF: /MAIN/z2 (Implicit) ObjectEntity COMPLEX(4) + !REF: /MAIN/x z2%re = x - !REF: /main/z2 - !REF: /main/y + !REF: /MAIN/z2 + !REF: /MAIN/y z2%im = y - !REF: /main/x - !REF: /main/w - !REF: /main/t/re + !REF: /MAIN/x + !REF: /MAIN/w + !REF: /MAIN/t/re x = w%re - !REF: /main/y - !REF: /main/w - !REF: /main/t/im + !REF: /MAIN/y + !REF: /MAIN/w + !REF: /MAIN/t/im y = w%im end program diff --git a/flang/test/Semantics/symbol08.f90 b/flang/test/Semantics/symbol08.f90 index 61dab798955c5..933ff6d0c2ba8 100644 --- a/flang/test/Semantics/symbol08.f90 +++ b/flang/test/Semantics/symbol08.f90 @@ -1,15 +1,15 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 -!DEF: /main MainProgram -program main - !DEF: /main/x POINTER ObjectEntity REAL(4) +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/x POINTER ObjectEntity REAL(4) pointer :: x - !REF: /main/x + !REF: /MAIN/x real x - !DEF: /main/y EXTERNAL, POINTER (Function) ProcEntity REAL(4) + !DEF: /MAIN/y EXTERNAL, POINTER (Function) ProcEntity REAL(4) pointer :: y - !REF: /main/y + !REF: /MAIN/y procedure (real) :: y - !DEF: /main/z (Implicit) ObjectEntity REAL(4) - !REF: /main/y + !DEF: /MAIN/z (Implicit) ObjectEntity REAL(4) + !REF: /MAIN/y z = y() end program diff --git a/flang/test/Semantics/symbol15.f90 b/flang/test/Semantics/symbol15.f90 index df10942e6af2d..79a45491306ef 100644 --- a/flang/test/Semantics/symbol15.f90 +++ b/flang/test/Semantics/symbol15.f90 @@ -249,15 +249,15 @@ subroutine ext2 !DEF: /ext3 (Subroutine) Subprogram subroutine ext3 end subroutine -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN !REF: /m use :: m - !DEF: /main/pdt1 Use - !DEF: /main/pdt1y ObjectEntity TYPE(pdt1(k=2_4)) + !DEF: /MAIN/pdt1 Use + !DEF: /MAIN/pdt1y ObjectEntity TYPE(pdt1(k=2_4)) type(pdt1(2)) :: pdt1y - !DEF: /main/pdt2 Use - !DEF: /main/pdt2y ObjectEntity TYPE(pdt2(k=2_4)) + !DEF: /MAIN/pdt2 Use + !DEF: /MAIN/pdt2y ObjectEntity TYPE(pdt2(k=2_4)) type(pdt2(2)) :: pdt2y print *, "compiled" end program diff --git a/flang/test/Semantics/symbol16.f90 b/flang/test/Semantics/symbol16.f90 index 7a46092c36b53..547c4624d4cdb 100644 --- a/flang/test/Semantics/symbol16.f90 +++ b/flang/test/Semantics/symbol16.f90 @@ -1,18 +1,18 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 ! 
Statement functions -!DEF: /p1 MainProgram -program p1 - !DEF: /p1/f (Function, StmtFunction) Subprogram INTEGER(4) - !DEF: /p1/i ObjectEntity INTEGER(4) - !DEF: /p1/j ObjectEntity INTEGER(4) +!DEF: /P1 MainProgram +program P1 + !DEF: /P1/f (Function, StmtFunction) Subprogram INTEGER(4) + !DEF: /P1/i ObjectEntity INTEGER(4) + !DEF: /P1/j ObjectEntity INTEGER(4) integer f, i, j - !REF: /p1/f - !REF: /p1/i - !DEF: /p1/f/i ObjectEntity INTEGER(4) + !REF: /P1/f + !REF: /P1/i + !DEF: /P1/f/i ObjectEntity INTEGER(4) f(i) = i + 1 - !REF: /p1/j - !REF: /p1/f + !REF: /P1/j + !REF: /P1/f j = f(2) end program diff --git a/flang/test/Semantics/symbol17.f90 b/flang/test/Semantics/symbol17.f90 index 434f124509a32..a0d916e55cfa4 100644 --- a/flang/test/Semantics/symbol17.f90 +++ b/flang/test/Semantics/symbol17.f90 @@ -1,44 +1,44 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 ! Forward references to derived types (non-error cases) -!DEF: /main MainProgram -program main - !DEF: /main/t1 DerivedType +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/t1 DerivedType type :: t1 - !DEF: /main/t2 DerivedType - !DEF: /main/t1/t1a ALLOCATABLE ObjectEntity TYPE(t2) + !DEF: /MAIN/t2 DerivedType + !DEF: /MAIN/t1/t1a ALLOCATABLE ObjectEntity TYPE(t2) type(t2), allocatable :: t1a - !REF: /main/t2 - !DEF: /main/t1/t1p POINTER ObjectEntity TYPE(t2) + !REF: /MAIN/t2 + !DEF: /MAIN/t1/t1p POINTER ObjectEntity TYPE(t2) type(t2), pointer :: t1p end type - !REF: /main/t2 + !REF: /MAIN/t2 type :: t2 - !REF: /main/t2 - !DEF: /main/t2/t2a ALLOCATABLE ObjectEntity TYPE(t2) + !REF: /MAIN/t2 + !DEF: /MAIN/t2/t2a ALLOCATABLE ObjectEntity TYPE(t2) type(t2), allocatable :: t2a - !REF: /main/t2 - !DEF: /main/t2/t2p POINTER ObjectEntity TYPE(t2) + !REF: /MAIN/t2 + !DEF: /MAIN/t2/t2p POINTER ObjectEntity TYPE(t2) type(t2), pointer :: t2p end type - !REF: /main/t1 - !DEF: /main/t1x TARGET ObjectEntity TYPE(t1) + !REF: /MAIN/t1 + !DEF: /MAIN/t1x TARGET ObjectEntity TYPE(t1) type(t1), target :: t1x - !REF: /main/t1x - !REF: /main/t1/t1a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1a allocate(t1x%t1a) - !REF: /main/t1x - !REF: /main/t1/t1p - !REF: /main/t1/t1a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1p + !REF: /MAIN/t1/t1a t1x%t1p => t1x%t1a - !REF: /main/t1x - !REF: /main/t1/t1a - !REF: /main/t2/t2a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1a + !REF: /MAIN/t2/t2a allocate(t1x%t1a%t2a) - !REF: /main/t1x - !REF: /main/t1/t1a - !REF: /main/t2/t2p - !REF: /main/t2/t2a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1a + !REF: /MAIN/t2/t2p + !REF: /MAIN/t2/t2a t1x%t1a%t2p => t1x%t1a%t2a end program !DEF: /f1/fwd DerivedType diff --git a/flang/test/Semantics/symbol18.f90 b/flang/test/Semantics/symbol18.f90 index a37792bce21d7..6e41bb5db91ee 100644 --- a/flang/test/Semantics/symbol18.f90 +++ b/flang/test/Semantics/symbol18.f90 @@ -2,21 +2,21 @@ ! 
Intrinsic function in type declaration statement: type is ignored -!DEF: /p1 MainProgram -program p1 - !DEF: /p1/cos ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity INTEGER(4) +!DEF: /P1 MainProgram +program P1 + !DEF: /P1/cos ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity INTEGER(4) integer cos - !DEF: /p1/y (Implicit) ObjectEntity REAL(4) - !REF: /p1/cos - !DEF: /p1/x (Implicit) ObjectEntity REAL(4) + !DEF: /P1/y (Implicit) ObjectEntity REAL(4) + !REF: /P1/cos + !DEF: /P1/x (Implicit) ObjectEntity REAL(4) y = cos(x) - !REF: /p1/y - !DEF: /p1/sin ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !REF: /p1/x + !REF: /P1/y + !DEF: /P1/sin ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !REF: /P1/x y = sin(x) - !REF: /p1/y + !REF: /P1/y !DEF: /f EXTERNAL (Function, Implicit) ProcEntity REAL(4) - !REF: /p1/x + !REF: /P1/x y = f(x) end program diff --git a/flang/test/Semantics/symbol20.f90 b/flang/test/Semantics/symbol20.f90 index 8c82776933321..bf3aff489b3b9 100644 --- a/flang/test/Semantics/symbol20.f90 +++ b/flang/test/Semantics/symbol20.f90 @@ -32,16 +32,16 @@ subroutine bar print *, "in bar" end subroutine end module -!DEF: /demo MainProgram -program demo +!DEF: /DEMO MainProgram +program DEMO !REF: /m use :: m - !DEF: /demo/bar (Subroutine) Use - !DEF: /demo/p EXTERNAL, POINTER (Subroutine) ProcEntity + !DEF: /DEMO/bar (Subroutine) Use + !DEF: /DEMO/p EXTERNAL, POINTER (Subroutine) ProcEntity procedure(bar), pointer :: p - !REF: /demo/p - !DEF: /demo/foo (Function) Use + !REF: /DEMO/p + !DEF: /DEMO/foo (Function) Use p => foo() - !REF: /demo/p + !REF: /DEMO/p call p end program diff --git a/flang/test/Semantics/symbol25.f90 b/flang/test/Semantics/symbol25.f90 index ac3dd37ef92eb..ac47a19eae8cc 100644 --- a/flang/test/Semantics/symbol25.f90 +++ b/flang/test/Semantics/symbol25.f90 @@ -38,23 +38,23 @@ subroutine inner1 end subroutine inner1 end subroutine outer end module m -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN !REF: /m use :: m !REF: /m/specific1 call generic - !DEF: /main/inner2 (Subroutine) Subprogram + !DEF: /MAIN/inner2 (Subroutine) Subprogram call inner2 contains - !REF: /main/inner2 + !REF: /MAIN/inner2 subroutine inner2 - !DEF: /main/inner2/generic (Subroutine) Generic + !DEF: /MAIN/inner2/generic (Subroutine) Generic interface generic - !DEF: /main/specific2 (Subroutine) Use + !DEF: /MAIN/specific2 (Subroutine) Use module procedure :: specific2 end interface - !REF: /main/specific2 + !REF: /MAIN/specific2 call generic end subroutine inner2 end program diff --git a/flang/test/Semantics/symbol26.f90 b/flang/test/Semantics/symbol26.f90 index f5e95853ca099..dded4b632c654 100644 --- a/flang/test/Semantics/symbol26.f90 +++ b/flang/test/Semantics/symbol26.f90 @@ -8,16 +8,16 @@ module m !DEF: /m/j PUBLIC (Implicit, InNamelist) ObjectEntity INTEGER(4) namelist/a/j end module m -!DEF: /main MainProgram -program main - !DEF: /main/j (Implicit) ObjectEntity INTEGER(4) +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/j (Implicit) ObjectEntity INTEGER(4) j = 1 contains - !DEF: /main/inner (Subroutine) Subprogram + !DEF: /MAIN/inner (Subroutine) Subprogram subroutine inner !REF: /m use :: m - !DEF: /main/inner/j (Implicit, InNamelist) Use INTEGER(4) + !DEF: /MAIN/inner/j (Implicit, InNamelist) Use INTEGER(4) j = 2 end subroutine end program diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90 index bb20c546e0261..aeec336ea58ea 100644 --- a/flang/test/Semantics/typeinfo01.f90 +++ 
b/flang/test/Semantics/typeinfo01.f90 @@ -87,8 +87,8 @@ subroutine s2(x, y) !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] -!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s1)] +!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] !CHECK: .v.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s2,name=.n.s1)] end module @@ -115,8 +115,8 @@ subroutine s2(x, y) !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .s.t, 
SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] -!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s1)] +!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] !CHECK: .v.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s2,name=.n.s1)] end module @@ -133,7 +133,7 @@ impure elemental subroutine s1(x, y) class(t), intent(in) :: y end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s1)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] end module @@ -156,7 +156,7 @@ subroutine s4(x) type(t), contiguous :: x(:,:,:) end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=1_1,proc=s4)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=0_1,specialcaseflag=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=0_1,specialcaseflag=1_1,proc=s4)] end module module m09 @@ -198,7 +198,7 @@ subroutine wu(x,u,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=3_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=4_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=2_1,specialcaseflag=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=3_1,specialcaseflag=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=4_1,specialcaseflag=0_1,proc=wu)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)] end module @@ -247,7 +247,7 @@ subroutine wu(x,u,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=wu)] end module module m11 @@ -290,7 +290,7 @@ module m13 contains procedure :: assign1, assign2 generic :: assignment(=) => assign1, assign2 - ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=assign1)] + ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=assign1)] end type contains impure elemental subroutine assign1(to, from) diff --git a/flang/test/Semantics/typeinfo02.f90 b/flang/test/Semantics/typeinfo02.f90 index 29d14c7a0f196..07293627ab492 100644 --- a/flang/test/Semantics/typeinfo02.f90 +++ b/flang/test/Semantics/typeinfo02.f90 @@ -29,5 +29,5 @@ subroutine wf2(x,u,iot,v,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine end module -!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf1)] -!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf2)] +!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=wf1)] +!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=wf2)] diff --git a/flang/test/Semantics/typeinfo09.f90 b/flang/test/Semantics/typeinfo09.f90 index 3527ee6058ad8..8daa6a5f420d7 100644 --- a/flang/test/Semantics/typeinfo09.f90 +++ b/flang/test/Semantics/typeinfo09.f90 @@ -17,4 +17,4 @@ subroutine copy_impl(this, x) end interface end module -!CHECK: .s.sometype, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=copy_impl)] +!CHECK: .s.sometype, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=copy_impl)] diff --git a/flang/test/Semantics/typeinfo13.f90 b/flang/test/Semantics/typeinfo13.f90 index ad824ad3590a2..facc280815722 100644 --- a/flang/test/Semantics/typeinfo13.f90 +++ b/flang/test/Semantics/typeinfo13.f90 @@ -22,5 +22,5 @@ impure elemental subroutine override(to, from) end end -!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 
init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=override)] +!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=2_1,specialcaseflag=0_1,proc=override)] !CHECK: .v.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=baseassign,name=.n.baseassign),binding(proc=override,name=.n.override)] diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index 12f63031cbaee..6f24b346e3fb9 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -5,7 +5,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ ! RUN: | FileCheck %s -! CHECK-LABEL: do_concurrent_basic +! CHECK-LABEL: DO_CONCURRENT_BASIC program do_concurrent_basic ! CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Transforms/lower-repack-arrays.fir b/flang/test/Transforms/lower-repack-arrays.fir index 458869cce45fd..9232a74f224d3 100644 --- a/flang/test/Transforms/lower-repack-arrays.fir +++ b/flang/test/Transforms/lower-repack-arrays.fir @@ -28,15 +28,14 @@ func.func @_QPtest1(%arg0: !fir.box> {fir.bindc_name = "x"}) // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] = fir.allocmem !fir.array, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_26]] : !fir.box> +// CHECK: fir.result %[[VAL_20]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -93,15 +92,14 @@ func.func @_QPtest1_whole(%arg0: !fir.box> {fir.bindc_name = // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] = fir.alloca !fir.array, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.ref>, !fir.shape<2>) -> !fir.ref> -// 
CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) : (!fir.ref>, !fir.shapeshift<2>) -> !fir.box> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_26]] : !fir.box> +// CHECK: fir.result %[[VAL_20]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -156,15 +154,14 @@ func.func @_QPtest1_in(%arg0: !fir.box> {fir.bindc_name = "x // CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_17:.*]] = fir.allocmem !fir.array, %[[VAL_14]]#1, %[[VAL_15]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_18:.*]] = fir.declare %[[VAL_17]](%[[VAL_16]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> -// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_24:.*]] = fir.shape_shift %[[VAL_14]]#0, %[[VAL_14]]#1, %[[VAL_15]]#0, %[[VAL_15]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_24]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> // CHECK: %[[VAL_20:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_20]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_24:.*]] = fir.shift %[[VAL_14]]#0, %[[VAL_15]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_25:.*]] = fir.rebox %[[VAL_19]](%[[VAL_24]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_25]] : !fir.box> +// CHECK: fir.result %[[VAL_19]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -215,10 +212,9 @@ func.func @_QPtest1_out(%arg0: !fir.box> {fir.bindc_name = " // CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_17:.*]] = fir.allocmem !fir.array, %[[VAL_14]]#1, %[[VAL_15]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_18:.*]] = fir.declare %[[VAL_17]](%[[VAL_16]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> -// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> -// CHECK: %[[VAL_20:.*]] = fir.shift %[[VAL_14]]#0, %[[VAL_15]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_21:.*]] = fir.rebox 
%[[VAL_19]](%[[VAL_20]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_21]] : !fir.box> +// CHECK: %[[VAL_20:.*]] = fir.shape_shift %[[VAL_14]]#0, %[[VAL_14]]#1, %[[VAL_15]]#0, %[[VAL_15]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_20]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> +// CHECK: fir.result %[[VAL_19]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -286,15 +282,14 @@ func.func @_QPtest2(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.box // CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_24:.*]] = fir.allocmem !fir.array>(%[[VAL_12]] : i32), %[[VAL_21]]#1, %[[VAL_22]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_25:.*]] = fir.declare %[[VAL_24]](%[[VAL_23]]) typeparams %[[VAL_12]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, i32) -> !fir.heap>> -// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) typeparams %[[VAL_12]] : (!fir.heap>>, !fir.shape<2>, i32) -> !fir.box>>> +// CHECK: %[[VAL_31:.*]] = fir.shape_shift %[[VAL_21]]#0, %[[VAL_21]]#1, %[[VAL_22]]#0, %[[VAL_22]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_31]]) typeparams %[[VAL_12]] : (!fir.heap>>, !fir.shapeshift<2>, i32) -> !fir.box>> // CHECK: %[[VAL_27:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_3]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_31:.*]] = fir.shift %[[VAL_21]]#0, %[[VAL_22]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_32:.*]] = fir.rebox %[[VAL_26]](%[[VAL_31]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_32]] : !fir.box>> +// CHECK: fir.result %[[VAL_26]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_1]] : !fir.box>> // CHECK: } @@ -362,15 +357,14 @@ func.func @_QPtest2_stack(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !f // CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_24:.*]] = fir.alloca !fir.array>(%[[VAL_12]] : i32), %[[VAL_21]]#1, %[[VAL_22]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_25:.*]] = fir.declare %[[VAL_24]](%[[VAL_23]]) typeparams %[[VAL_12]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, i32) -> !fir.ref>> -// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) typeparams %[[VAL_12]] : (!fir.ref>>, !fir.shape<2>, i32) -> !fir.box>> +// CHECK: %[[VAL_31:.*]] = fir.shape_shift %[[VAL_21]]#0, %[[VAL_21]]#1, %[[VAL_22]]#0, %[[VAL_22]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_31]]) typeparams %[[VAL_12]] : (!fir.ref>>, !fir.shapeshift<2>, i32) -> !fir.box>> // CHECK: %[[VAL_27:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call 
@_FortranAShallowCopyDirect(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_3]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_31:.*]] = fir.shift %[[VAL_21]]#0, %[[VAL_22]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_32:.*]] = fir.rebox %[[VAL_26]](%[[VAL_31]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_32]] : !fir.box>> +// CHECK: fir.result %[[VAL_26]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_1]] : !fir.box>> // CHECK: } @@ -427,15 +421,14 @@ func.func @_QPtest3(%arg0: !fir.box>> {fir.bindc_n // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.allocmem !fir.array>(%[[VAL_17]] : index), %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_17]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, index) -> !fir.heap>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) typeparams %[[VAL_17]] : (!fir.heap>>, !fir.shape<2>, index) -> !fir.box>>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) typeparams %[[VAL_17]] : (!fir.heap>>, !fir.shapeshift<2>, index) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -493,15 +486,14 @@ func.func @_QPtest3_stack(%arg0: !fir.box>> {fir.b // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.alloca !fir.array>(%[[VAL_17]] : index), %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_17]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, index) -> !fir.ref>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) typeparams %[[VAL_17]] : (!fir.ref>>, !fir.shape<2>, index) -> !fir.box>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) typeparams %[[VAL_17]] : (!fir.ref>>, !fir.shapeshift<2>, index) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call 
@_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -559,15 +551,14 @@ func.func @_QPtest4(%arg0: !fir.box>> {fir.bindc_ // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.allocmem !fir.array>, %[[VAL_16]]#1, %[[VAL_17]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_6]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, index) -> !fir.heap>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.box>>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_16]]#0, %[[VAL_16]]#1, %[[VAL_17]]#0, %[[VAL_17]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) : (!fir.heap>>, !fir.shapeshift<2>) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_16]]#0, %[[VAL_17]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -626,15 +617,14 @@ func.func @_QPtest4_stack(%arg0: !fir.box>> {fir. 
// CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.alloca !fir.array>, %[[VAL_16]]#1, %[[VAL_17]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_6]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, index) -> !fir.ref>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_16]]#0, %[[VAL_16]]#1, %[[VAL_17]]#0, %[[VAL_17]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) : (!fir.ref>>, !fir.shapeshift<2>) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_16]]#0, %[[VAL_17]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -690,15 +680,15 @@ func.func @_QPtest5(%arg0: !fir.box>> {fir.bind // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] = fir.allocmem !fir.array>, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>) -> !fir.heap>> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.box>>> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) source_box %[[VAL_0]] : (!fir.heap>>, !fir.shapeshift<2>, !fir.box>>) -> !fir.class>> +// CHECK: %[[BOX:.*]] = fir.convert %[[VAL_20]] : (!fir.class>>) -> !fir.box>> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[BOX]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_26]] : !fir.box>> +// CHECK: fir.result %[[BOX]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -755,15 +745,15 @@ func.func @_QPtest5_stack(%arg0: !fir.box>> {fi // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] 
= fir.alloca !fir.array>, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>) -> !fir.ref>> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) source_box %[[VAL_0]] : (!fir.ref>>, !fir.shapeshift<2>, !fir.box>>) -> !fir.class>> +// CHECK: %[[BOX:.*]] = fir.convert %[[VAL_20]] : (!fir.class>>) -> !fir.box>> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[BOX]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_26]] : !fir.box>> +// CHECK: fir.result %[[BOX]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -830,13 +820,14 @@ func.func @_QPtest6(%arg0: !fir.class>> {fir.bi // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>>) -> !fir.class>>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>>) -> !fir.heap>> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>>, !fir.shapeshift<2>, !fir.class>>) -> !fir.class>> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>>, !fir.shift<2>) -> !fir.class>> // CHECK: fir.result %[[VAL_34]] : !fir.class>> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class>> @@ -906,13 +897,14 @@ func.func @_QPtest6_stack(%arg0: !fir.class>> { // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // 
CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>>) -> !fir.class>>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>>) -> !fir.heap>> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>>, !fir.shapeshift<2>, !fir.class>>) -> !fir.class>> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>>, !fir.shift<2>) -> !fir.class>> // CHECK: fir.result %[[VAL_34]] : !fir.class>> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class>> @@ -981,13 +973,14 @@ func.func @_QPtest7(%arg0: !fir.class> {fir.bindc_name = "x // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>) -> !fir.class>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>) -> !fir.heap> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>, !fir.shapeshift<2>, !fir.class>) -> !fir.class> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>, !fir.shift<2>) -> !fir.class> // CHECK: fir.result %[[VAL_34]] : !fir.class> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class> @@ -1057,13 +1050,14 @@ func.func @_QPtest7_stack(%arg0: !fir.class> {fir.bindc_nam // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>) -> !fir.class>> 
+// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>) -> !fir.heap> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>, !fir.shapeshift<2>, !fir.class>) -> !fir.class> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>, !fir.shift<2>) -> !fir.class> // CHECK: fir.result %[[VAL_34]] : !fir.class> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class> diff --git a/flang/test/Transforms/tbaa-local-alloc-threshold.fir b/flang/test/Transforms/tbaa-local-alloc-threshold.fir new file mode 100644 index 0000000000000..27c19a6e23095 --- /dev/null +++ b/flang/test/Transforms/tbaa-local-alloc-threshold.fir @@ -0,0 +1,23 @@ +// Check that -local-alloc-tbaa-threshold option limits +// the attachment of TBAA tags to accesses of locally allocated entities. +// RUN: fir-opt --fir-add-alias-tags -local-alloc-tbaa-threshold=2 %s | FileCheck %s --check-prefixes=ALL,COUNT2 +// RUN: fir-opt --fir-add-alias-tags -local-alloc-tbaa-threshold=1 %s | FileCheck %s --check-prefixes=ALL,COUNT1 +// RUN: fir-opt --fir-add-alias-tags -local-alloc-tbaa-threshold=0 %s | FileCheck %s --check-prefixes=ALL,COUNT0 + +// ALL-LABEL: func.func @_QPtest() { +// COUNT2: fir.load{{.*}}{tbaa = +// COUNT2: fir.store{{.*}}{tbaa = +// COUNT1: fir.load{{.*}}{tbaa = +// COUNT1-NOT: fir.store{{.*}}{tbaa = +// COUNT0-NOT: fir.load{{.*}}{tbaa = +// COUNT0-NOT: fir.store{{.*}}{tbaa = +func.func @_QPtest() { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFtestEx"} + %2 = fir.declare %1 {uniq_name = "_QFtestEx"} : (!fir.ref) -> !fir.ref + %3 = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFtestEy"} + %4 = fir.declare %3 {uniq_name = "_QFtestEy"} : (!fir.ref) -> !fir.ref + %5 = fir.load %4 : !fir.ref + fir.store %5 to %2 : !fir.ref + return +} diff --git a/flang/unittests/Optimizer/Builder/CharacterTest.cpp b/flang/unittests/Optimizer/Builder/CharacterTest.cpp index 6d912b81d9541..d8d2da40ba9a6 100644 --- a/flang/unittests/Optimizer/Builder/CharacterTest.cpp +++ b/flang/unittests/Optimizer/Builder/CharacterTest.cpp @@ -29,7 +29,7 @@ struct CharacterTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/Builder/ComplexTest.cpp b/flang/unittests/Optimizer/Builder/ComplexTest.cpp index 689af4642b0b6..d5f00c9b61108 100644 --- a/flang/unittests/Optimizer/Builder/ComplexTest.cpp +++ 
b/flang/unittests/Optimizer/Builder/ComplexTest.cpp @@ -25,7 +25,7 @@ struct ComplexTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp index 3e2af24c47b96..e4c21f6b65a36 100644 --- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp +++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp @@ -29,7 +29,7 @@ struct FIRBuilderTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); @@ -176,8 +176,7 @@ TEST_F(FIRBuilderTest, getNamedFunction) { auto func2 = builder.getNamedFunction("func2"); EXPECT_EQ(nullptr, func2); auto loc = builder.getUnknownLoc(); - func2 = builder.createFunction( - loc, "func2", builder.getFunctionType(std::nullopt, std::nullopt)); + func2 = builder.createFunction(loc, "func2", builder.getFunctionType({}, {})); auto func2query = builder.getNamedFunction("func2"); EXPECT_EQ(func2, func2query); } diff --git a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp index 29700d2d3dbff..a0785198b078d 100644 --- a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp +++ b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp @@ -28,7 +28,7 @@ struct HLFIRToolsTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h index 40abf567400b3..4ecec92f42dc2 100644 --- a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h +++ b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h @@ -26,9 +26,8 @@ struct RuntimeCallTest : public testing::Test { // Set the insertion point in the function entry block. moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); - mlir::func::FuncOp func = - builder.create(loc, "runtime_unit_tests_func", - builder.getFunctionType(std::nullopt, std::nullopt)); + mlir::func::FuncOp func = builder.create( + loc, "runtime_unit_tests_func", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 30c23b63b4d56..59808779aa6ef 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -21,9 +21,8 @@ struct FortranVariableTest : public testing::Test { // Set the insertion point in the function entry block. 
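For reference, the pattern these unit-test updates adopt is to spell empty parameter and result lists as brace-initialized ranges instead of std::nullopt, and to pass mlir::ValueRange{} where an empty list of SSA values is expected. A minimal, self-contained sketch of that pattern (illustrative only, not taken from this patch; it assumes nothing beyond upstream MLIR's OpBuilder and func dialect APIs):

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OwningOpRef.h"

// Build a module holding an empty "() -> ()" function the way the updated
// tests now do: empty braced lists rather than std::nullopt for the type
// ranges.
static mlir::OwningOpRef<mlir::ModuleOp> buildEmptyFunc(mlir::MLIRContext &ctx) {
  ctx.loadDialect<mlir::func::FuncDialect>(); // needed before creating func ops
  mlir::OpBuilder builder(&ctx);
  mlir::Location loc = builder.getUnknownLoc();
  mlir::OwningOpRef<mlir::ModuleOp> module = mlir::ModuleOp::create(loc);
  builder.setInsertionPointToStart(module->getBody());
  auto func = builder.create<mlir::func::FuncOp>(
      loc, "func1", builder.getFunctionType({}, {}));
  builder.setInsertionPointToStart(func.addEntryBlock());
  builder.create<mlir::func::ReturnOp>(loc);
  return module;
}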
moduleOp = builder->create(loc); builder->setInsertionPointToStart(moduleOp->getBody()); - mlir::func::FuncOp func = - builder->create(loc, "fortran_variable_tests", - builder->getFunctionType(std::nullopt, std::nullopt)); + mlir::func::FuncOp func = builder->create( + loc, "fortran_variable_tests", builder->getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder->setInsertionPointToStart(entryBlock); } @@ -49,7 +48,7 @@ TEST_F(FortranVariableTest, SimpleScalar) { mlir::Value addr = builder->create(loc, eleType); auto name = mlir::StringAttr::get(&context, "x"); auto declare = builder->create(loc, addr.getType(), addr, - /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt, + /*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, /*data_attr=*/cuf::DataAttributeAttr{}); @@ -103,11 +102,11 @@ TEST_F(FortranVariableTest, SimpleArray) { extents.size(), fir::SequenceType::getUnknownExtent()); mlir::Type seqTy = fir::SequenceType::get(typeShape, eleType); mlir::Value addr = builder->create( - loc, seqTy, /*pinned=*/false, /*typeParams=*/std::nullopt, extents); + loc, seqTy, /*pinned=*/false, /*typeParams=*/mlir::ValueRange{}, extents); mlir::Value shape = createShape(extents); auto name = mlir::StringAttr::get(&context, "x"); auto declare = builder->create(loc, addr.getType(), addr, - shape, /*typeParams*/ std::nullopt, /*dummy_scope=*/nullptr, name, + shape, /*typeParams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, /*data_attr=*/cuf::DataAttributeAttr{}); diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 920c5b206b0fe..57ff5b9fdb846 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -7,9 +7,9 @@ #include "src/__support/GPU/utils.h" #include "src/__support/fixedvector.h" #include "src/__support/macros/config.h" +#include "src/__support/time/gpu/time_utils.h" #include "src/stdio/printf.h" #include "src/stdlib/srand.h" -#include "src/time/gpu/time_utils.h" namespace LIBC_NAMESPACE_DECL { namespace benchmarks { diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp index 3bb5b0cc6788c..1f91a9a35c373 100644 --- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp @@ -3,12 +3,8 @@ #include "src/math/atan2.h" #include "src/stdlib/rand.h" -#ifdef NVPTX_MATH_FOUND -#include "src/math/nvptx/declarations.h" -#endif - -#ifdef AMDGPU_MATH_FOUND -#include "src/math/amdgpu/declarations.h" +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" #endif #define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \ @@ -33,15 +29,15 @@ BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30); BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000); #ifdef NVPTX_MATH_FOUND -BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023); -BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3); -BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30); -BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000); +BENCH(double, NvAtan2, __nv_atan2, -1023, 1023); +BENCH(double, NvAtan2TwoPi, __nv_atan2, -10, 3); +BENCH(double, NvAtan2TwoPow30, __nv_atan2, 0, 30); +BENCH(double, NvAtan2Large, __nv_atan2, 30, 1000); #endif #ifdef AMDGPU_MATH_FOUND -BENCH(double, AmdAtan2, 
LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023); -BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3); -BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30); -BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000); +BENCH(double, AmdAtan2, __ocml_atan2_f64, -1023, 1023); +BENCH(double, AmdAtan2TwoPi, __ocml_atan2_f64, -10, 3); +BENCH(double, AmdAtan2TwoPow30, __ocml_atan2_f64, 0, 30); +BENCH(double, AmdAtan2Large, __ocml_atan2_f64, 30, 1000); #endif diff --git a/libc/benchmarks/gpu/src/math/platform.h b/libc/benchmarks/gpu/src/math/platform.h new file mode 100644 index 0000000000000..bb7825d38bd42 --- /dev/null +++ b/libc/benchmarks/gpu/src/math/platform.h @@ -0,0 +1,57 @@ +//===-- AMDGPU specific platform definitions for math support -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H +#define LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" +#include + +namespace LIBC_NAMESPACE_DECL { + +#ifdef LIBC_TARGET_ARCH_IS_AMDGPU +// The ROCm device library uses control globals to alter codegen for the +// different targets. To avoid needing to link them in manually we simply +// define them here. +extern "C" { +extern const LIBC_INLINE_VAR uint8_t __oclc_unsafe_math_opt = 0; +extern const LIBC_INLINE_VAR uint8_t __oclc_daz_opt = 0; +extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1; +extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0; +extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000; +} + +// These aliases cause clang to emit the control constants with ODR linkage. +// This allows us to link against the symbols without preventing them from being +// optimized out or causing symbol collisions. +[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__; +[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__; +[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t + __oclc_correctly_rounded_sqrt32__; +[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__; +[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__; +#endif +} // namespace LIBC_NAMESPACE_DECL + +// Forward declarations for the vendor math libraries. 
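The forward declarations that follow let the benchmarks call the vendor math entry points directly, without the per-vendor declaration headers (and LIBC_NAMESPACE qualification) that the sin and atan2 benchmark changes above drop. A hedged usage sketch, not taken from this patch (vendor_sin is a hypothetical helper; only the __ocml_*/__nv_* names come from the declarations below):

#if defined(AMDGPU_MATH_FOUND)
// ROCm device library entry point, declared extern "C" below.
static double vendor_sin(double x) { return __ocml_sin_f64(x); }
#elif defined(NVPTX_MATH_FOUND)
// CUDA libdevice entry point, declared extern "C" below.
static double vendor_sin(double x) { return __nv_sin(x); }
#else
// Fallback so the sketch still compiles when no vendor library is found.
static double vendor_sin(double x) { return x; }
#endif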
+extern "C" { +#ifdef AMDGPU_MATH_FOUND +double __ocml_sin_f64(double); +float __ocml_sin_f32(float); +double __ocml_atan2_f64(double, double); +float __ocml_atan2_f32(float, float); +#endif + +#ifdef NVPTX_MATH_FOUND +double __nv_sin(double); +float __nv_sinf(float); +double __nv_atan2(double, double); +float __nv_atan2f(float, float); +#endif +} + +#endif // LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp index bf09e6e462172..a759db2e9d33f 100644 --- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp @@ -8,12 +8,8 @@ #include "src/math/sinf.h" #include "src/stdlib/rand.h" -#ifdef NVPTX_MATH_FOUND -#include "src/math/nvptx/declarations.h" -#endif - -#ifdef AMDGPU_MATH_FOUND -#include "src/math/amdgpu/declarations.h" +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" #endif // BENCHMARK() expects a function that with no parameters that returns a @@ -42,17 +38,17 @@ BENCH(double, SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30); BENCH(double, SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000); #ifdef NVPTX_MATH_FOUND -BENCH(double, NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023); -BENCH(double, NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3); -BENCH(double, NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30); -BENCH(double, NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000); +BENCH(double, NvSin, __nv_sin, -1023, 1023); +BENCH(double, NvSinTwoPi, __nv_sin, -10, 3); +BENCH(double, NvSinTwoPow30, __nv_sin, 0, 30); +BENCH(double, NvSinVeryLarge, __nv_sin, 30, 1000); #endif #ifdef AMDGPU_MATH_FOUND -BENCH(double, AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023); -BENCH(double, AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3); -BENCH(double, AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30); -BENCH(double, AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000); +BENCH(double, AmdSin, __ocml_sin_f64, -1023, 1023); +BENCH(double, AmdSinTwoPi, __ocml_sin_f64, -10, 3); +BENCH(double, AmdSinTwoPow30, __ocml_sin_f64, 0, 30); +BENCH(double, AmdSinVeryLarge, __ocml_sin_f64, 30, 1000); #endif BENCH(float, Sinf, LIBC_NAMESPACE::sinf, -127, 128); @@ -61,15 +57,15 @@ BENCH(float, SinfTwoPow30, LIBC_NAMESPACE::sinf, 0, 30); BENCH(float, SinfVeryLarge, LIBC_NAMESPACE::sinf, 30, 120); #ifdef NVPTX_MATH_FOUND -BENCH(float, NvSinf, LIBC_NAMESPACE::__nv_sinf, -127, 128); -BENCH(float, NvSinfTwoPi, LIBC_NAMESPACE::__nv_sinf, -10, 3); -BENCH(float, NvSinfTwoPow30, LIBC_NAMESPACE::__nv_sinf, 0, 30); -BENCH(float, NvSinfVeryLarge, LIBC_NAMESPACE::__nv_sinf, 30, 120); +BENCH(float, NvSinf, __nv_sinf, -127, 128); +BENCH(float, NvSinfTwoPi, __nv_sinf, -10, 3); +BENCH(float, NvSinfTwoPow30, __nv_sinf, 0, 30); +BENCH(float, NvSinfVeryLarge, __nv_sinf, 30, 120); #endif #ifdef AMDGPU_MATH_FOUND -BENCH(float, AmdSinf, LIBC_NAMESPACE::__ocml_sin_f32, -127, 128); -BENCH(float, AmdSinfTwoPi, LIBC_NAMESPACE::__ocml_sin_f32, -10, 3); -BENCH(float, AmdSinfTwoPow30, LIBC_NAMESPACE::__ocml_sin_f32, 0, 30); -BENCH(float, AmdSinfVeryLarge, LIBC_NAMESPACE::__ocml_sin_f32, 30, 120); +BENCH(float, AmdSinf, __ocml_sin_f32, -127, 128); +BENCH(float, AmdSinfTwoPi, __ocml_sin_f32, -10, 3); +BENCH(float, AmdSinfTwoPow30, __ocml_sin_f32, 0, 30); +BENCH(float, AmdSinfVeryLarge, __ocml_sin_f32, 30, 120); #endif diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h index 4cf7e9838add3..0f2c04c07c921 100644 --- 
a/libc/benchmarks/gpu/timing/amdgpu/timing.h +++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU #include "src/__support/CPP/array.h" +#include "src/__support/CPP/atomic.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/common.h" @@ -24,7 +25,7 @@ namespace LIBC_NAMESPACE_DECL { // allows us to subtract the constant-time overhead from the latency to // obtain a true result. This can vary with system load. [[gnu::noinline]] static LIBC_INLINE uint64_t overhead() { - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); uint32_t result = 0.0; asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result)); @@ -44,13 +45,13 @@ template T arg = storage; // The AMDGPU architecture needs to wait on pending results. - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); // Get the current timestamp from the clock. uint64_t start = gpu::processor_clock(); // This forces the compiler to load the input argument and run the clock // cycle counter before the profiling region. - asm("" ::"s"(start)); + asm("" : "+v"(arg) : "s"(start)); // Run the function under test and return its value. auto result = f(arg); @@ -71,7 +72,7 @@ template // ordering. uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); // Return the time elapsed. return stop - start; @@ -84,7 +85,7 @@ template T1 arg1 = storage1; T2 arg2 = storage2; - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"s"(start)); @@ -100,7 +101,7 @@ template uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); return stop - start; } @@ -111,7 +112,7 @@ template throughput(F f, const cpp::array &inputs) { asm("" ::"v"(&inputs)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"s"(start)); @@ -124,7 +125,7 @@ throughput(F f, const cpp::array &inputs) { uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); // Return the time elapsed. return stop - start; @@ -136,7 +137,7 @@ template F f, const cpp::array &inputs1, const cpp::array &inputs2) { asm("" ::"v"(&inputs1), "v"(&inputs2)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"s"(start)); @@ -149,7 +150,7 @@ template uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); // Return the time elapsed. return stop - start; diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index ece7d9a6c5396..3ed97645ddc93 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX #include "src/__support/CPP/array.h" +#include "src/__support/CPP/atomic.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/common.h" @@ -46,7 +47,7 @@ template T arg = storage; // Get the current timestamp from the clock.
- gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); // This forces the compiler to load the input argument and run the clock cycle @@ -63,7 +64,7 @@ template // Obtain the current timestamp after running the calculation and force // ordering. uint64_t stop = gpu::processor_clock(); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); volatile T output = result; @@ -78,7 +79,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { T1 arg = storage; T2 arg2 = storage2; - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"llr"(start)); @@ -88,7 +89,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result)); uint64_t stop = gpu::processor_clock(); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); volatile auto output = result; @@ -101,7 +102,7 @@ template throughput(F f, const cpp::array &inputs) { asm("" ::"r"(&inputs)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"llr"(start)); @@ -114,7 +115,7 @@ throughput(F f, const cpp::array &inputs) { } uint64_t stop = gpu::processor_clock(); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); volatile auto output = result; @@ -128,7 +129,7 @@ template F f, const cpp::array &inputs1, const cpp::array &inputs2) { asm("" ::"r"(&inputs1), "r"(&inputs2)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"llr"(start)); @@ -140,7 +141,7 @@ template } uint64_t stop = gpu::processor_clock(); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); volatile auto output = result; diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake b/libc/cmake/modules/LLVMLibCArchitectures.cmake index c94a407d974df..d4103f8a5a23f 100644 --- a/libc/cmake/modules/LLVMLibCArchitectures.cmake +++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake @@ -153,9 +153,11 @@ elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "x86_64") elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "i386") set(LIBC_TARGET_ARCHITECTURE_IS_X86 TRUE) elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv64") + set(LIBC_TARGET_ARCHITECTURE_IS_ANY_RISCV TRUE) set(LIBC_TARGET_ARCHITECTURE_IS_RISCV64 TRUE) set(LIBC_TARGET_ARCHITECTURE "riscv") elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv32") + set(LIBC_TARGET_ARCHITECTURE_IS_ANY_RISCV TRUE) set(LIBC_TARGET_ARCHITECTURE_IS_RISCV32 TRUE) set(LIBC_TARGET_ARCHITECTURE "riscv") elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "amdgpu") diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 82d06e2b9eb55..2478fde64d430 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -13,7 +13,7 @@ endif() function(_get_compile_options_from_flags output_var) set(compile_options "") - if(LIBC_TARGET_ARCHITECTURE_IS_RISCV64 OR(LIBC_CPU_FEATURES MATCHES "FMA")) + if(LIBC_CPU_FEATURES MATCHES "FMA") check_flag(ADD_FMA_FLAG ${FMA_OPT_FLAG} ${ARGN}) endif() check_flag(ADD_ROUND_OPT_FLAG ${ROUND_OPT_FLAG} ${ARGN}) @@ -25,8 +25,6 @@ function(_get_compile_options_from_flags output_var) if(LIBC_TARGET_ARCHITECTURE_IS_X86_64) 
list(APPEND compile_options "-mavx2") list(APPEND compile_options "-mfma") - elseif(LIBC_TARGET_ARCHITECTURE_IS_RISCV64) - list(APPEND compile_options "-D__LIBC_RISCV_USE_FMA") endif() # For clang, we will build the math functions with `-fno-math-errno` so that # __builtin_fma* will generate the fused-multiply-add instructions. We diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake index 7d5e73c2f1214..4bbd21ab569dc 100644 --- a/libc/cmake/modules/LLVMLibCFlagRules.cmake +++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake @@ -270,7 +270,7 @@ set(MISC_MATH_BASIC_OPS_OPT_FLAG "MISC_MATH_BASIC_OPS_OPT") # Skip FMA_OPT flag for targets that don't support fma. if(NOT DEFINED SKIP_FLAG_EXPANSION_FMA_OPT) if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86_64 AND (LIBC_CPU_FEATURES MATCHES "FMA")) OR - LIBC_TARGET_ARCHITECTURE_IS_RISCV64)) + LIBC_TARGET_ARCHITECTURE_IS_ANY_RISCV)) set(SKIP_FLAG_EXPANSION_FMA_OPT TRUE) endif() endif() diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index e210992c5111a..267c32e956945 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -71,6 +71,7 @@ endfunction() function(_get_hermetic_test_compile_options output_var) _get_common_test_compile_options(compile_options "" "") + list(APPEND compile_options "-DLIBC_TEST=HERMETIC") # null check tests are death tests, remove from hermetic tests for now. if(LIBC_ADD_NULL_CHECKS) @@ -232,6 +233,7 @@ function(create_libc_unittest fq_target_name) _get_common_test_compile_options(compile_options "${LIBC_UNITTEST_C_TEST}" "${LIBC_UNITTEST_FLAGS}") + list(APPEND compile_options "-DLIBC_TEST=UNIT") # TODO: Ideally we would have a separate function for link options.
set(link_options ${compile_options} @@ -571,6 +573,8 @@ function(add_integration_test test_name) target_compile_options(${fq_build_target_name} PRIVATE ${compile_options} ${INTEGRATION_TEST_COMPILE_OPTIONS}) + set(compiler_runtime "") + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} ${INTEGRATION_TEST_COMPILE_OPTIONS} @@ -599,17 +603,19 @@ function(add_integration_test test_name) set(link_options -nolibc -nostartfiles - -static + -nostdlib ${LIBC_LINK_OPTIONS_DEFAULT} ${LIBC_TEST_LINK_OPTIONS_DEFAULT} ) target_link_options(${fq_build_target_name} PRIVATE ${link_options}) + list(APPEND compiler_runtime ${LIBGCC_S_LOCATION}) endif() target_link_libraries( ${fq_build_target_name} - ${fq_target_name}.__libc__ libc.startup.${LIBC_TARGET_OS}.crt1 libc.test.IntegrationTest.test + ${fq_target_name}.__libc__ + ${compiler_runtime} ) add_dependencies(${fq_build_target_name} libc.test.IntegrationTest.test @@ -770,6 +776,7 @@ function(add_libc_hermetic test_name) ${HERMETIC_TEST_COMPILE_OPTIONS}) set(link_libraries "") + set(compiler_runtime "") foreach(lib IN LISTS HERMETIC_TEST_LINK_LIBRARIES) if(TARGET ${lib}.hermetic) list(APPEND link_libraries ${lib}.hermetic) @@ -807,12 +814,12 @@ function(add_libc_hermetic test_name) set(link_options -nolibc -nostartfiles - -static + -nostdlib ${LIBC_LINK_OPTIONS_DEFAULT} ${LIBC_TEST_LINK_OPTIONS_DEFAULT} ) target_link_options(${fq_build_target_name} PRIVATE ${link_options}) - list(APPEND link_libraries ${LIBGCC_S_LOCATION}) + list(APPEND compiler_runtime ${LIBGCC_S_LOCATION}) endif() target_link_libraries( ${fq_build_target_name} @@ -820,7 +827,9 @@ function(add_libc_hermetic test_name) libc.startup.${LIBC_TARGET_OS}.crt1 ${link_libraries} LibcHermeticTestSupport.hermetic - ${fq_target_name}.__libc__) + ${fq_target_name}.__libc__ + ${compiler_runtime} + ) add_dependencies(${fq_build_target_name} LibcTest.hermetic libc.test.UnitTest.ErrnoSetterMatcher diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index de7549c57ff44..80cd15eebc91f 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -278,6 +278,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcslen libc.src.wchar.wctob + # wctype.h entrypoints + libc.src.wctype.iswalpha + # internal entrypoints libc.startup.baremetal.init libc.startup.baremetal.fini diff --git a/libc/config/baremetal/arm/headers.txt b/libc/config/baremetal/arm/headers.txt index 5666ef7e0012d..1f64afebdaaa7 100644 --- a/libc/config/baremetal/arm/headers.txt +++ b/libc/config/baremetal/arm/headers.txt @@ -23,4 +23,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.time libc.include.uchar libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 7e8c186d52469..c9f8118f6e800 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -278,6 +278,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcslen libc.src.wchar.wctob + # wctype.h entrypoints + libc.src.wctype.iswalpha + # internal entrypoints libc.startup.baremetal.init libc.startup.baremetal.fini diff --git a/libc/config/baremetal/riscv/headers.txt b/libc/config/baremetal/riscv/headers.txt index 5666ef7e0012d..1f64afebdaaa7 100644 --- a/libc/config/baremetal/riscv/headers.txt +++ b/libc/config/baremetal/riscv/headers.txt @@ -23,4 +23,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.time 
libc.include.uchar libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/darwin/aarch64/entrypoints.txt b/libc/config/darwin/aarch64/entrypoints.txt index 4674a9309115b..3bfdcdbee555e 100644 --- a/libc/config/darwin/aarch64/entrypoints.txt +++ b/libc/config/darwin/aarch64/entrypoints.txt @@ -99,6 +99,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.calloc libc.src.stdlib.realloc libc.src.stdlib.free + + # wctype.h entrypoints + libc.src.wctype.iswalpha ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/config/darwin/aarch64/headers.txt b/libc/config/darwin/aarch64/headers.txt index 8f3d6029c9b6a..55a112c0c3ad3 100644 --- a/libc/config/darwin/aarch64/headers.txt +++ b/libc/config/darwin/aarch64/headers.txt @@ -11,4 +11,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.stdlib libc.include.string libc.include.strings + libc.include.wctype ) diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index cff5b7f8312d6..b2abebee017d8 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -363,6 +363,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcslen libc.src.wchar.wctob + # wctype.h entrypoints + libc.src.wctype.iswalpha + # sys/uio.h entrypoints libc.src.sys.uio.writev libc.src.sys.uio.readv diff --git a/libc/config/linux/aarch64/headers.txt b/libc/config/linux/aarch64/headers.txt index 01b0bf36498ce..6d3bc9188583b 100644 --- a/libc/config/linux/aarch64/headers.txt +++ b/libc/config/linux/aarch64/headers.txt @@ -57,4 +57,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.uchar libc.include.unistd libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index a1203cc4991af..5865dc93a9aef 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -191,6 +191,9 @@ set(TARGET_LIBC_ENTRYPOINTS # sys/time.h entrypoints libc.src.sys.time.setitimer libc.src.sys.time.getitimer + + # wctype.h entrypoints + libc.src.wctype.iswalpha ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/config/linux/arm/headers.txt b/libc/config/linux/arm/headers.txt index 9aabac5dea33c..14c730e2b77b1 100644 --- a/libc/config/linux/arm/headers.txt +++ b/libc/config/linux/arm/headers.txt @@ -17,6 +17,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.strings libc.include.uchar libc.include.wchar + libc.include.wctype # Disabled due to epoll_wait syscalls not being available on this platform. 
# libc.include.sys_epoll diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 14361f5b6beff..79077a5e66ef5 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -368,6 +368,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcslen libc.src.wchar.wctob + # wctype.h entrypoints + libc.src.wctype.iswalpha + # sys/uio.h entrypoints libc.src.sys.uio.writev libc.src.sys.uio.readv diff --git a/libc/config/linux/riscv/headers.txt b/libc/config/linux/riscv/headers.txt index 01b0bf36498ce..6d3bc9188583b 100644 --- a/libc/config/linux/riscv/headers.txt +++ b/libc/config/linux/riscv/headers.txt @@ -57,4 +57,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.uchar libc.include.unistd libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 9223911f04a93..381359cec6f1d 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -396,6 +396,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcstoul libc.src.wchar.wcstoull + # wctype.h entrypoints + libc.src.wctype.iswalpha # sys/uio.h entrypoints libc.src.sys.uio.writev diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index 01b0bf36498ce..6d3bc9188583b 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -57,4 +57,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.uchar libc.include.unistd libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index 8898fd74c302f..18027298acc18 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -105,6 +105,9 @@ set(TARGET_LIBC_ENTRYPOINTS # unistd.h entrypoints libc.src.unistd.getentropy + + # wctype.h entrypoints + libc.src.wctype.iswalpha ) set(TARGET_LIBM_ENTRYPOINTS diff --git a/libc/config/windows/headers.txt b/libc/config/windows/headers.txt index 6d9aae9276924..d4a0947d867bb 100644 --- a/libc/config/windows/headers.txt +++ b/libc/config/windows/headers.txt @@ -7,4 +7,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.fenv libc.include.math libc.include.unistd + libc.include.wctype ) diff --git a/libc/fuzzing/math/CMakeLists.txt b/libc/fuzzing/math/CMakeLists.txt index c1a93058764b3..be63fe4b65aea 100644 --- a/libc/fuzzing/math/CMakeLists.txt +++ b/libc/fuzzing/math/CMakeLists.txt @@ -196,3 +196,12 @@ add_libc_fuzzer( DEPENDS libc.src.__support.FPUtil.generic.sqrt ) + +add_libc_fuzzer( + cbrt_fuzz + NEED_MPFR + SRCS + cbrt_fuzz.cpp + DEPENDS + libc.src.math.cbrt +) diff --git a/libc/fuzzing/math/acos_fuzz.cpp b/libc/fuzzing/math/acos_fuzz.cpp index d2b5456026839..48fb4eacc3a79 100644 --- a/libc/fuzzing/math/acos_fuzz.cpp +++ b/libc/fuzzing/math/acos_fuzz.cpp @@ -12,26 +12,40 @@ #include "src/math/acos.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf and values outside accepted range - if (isnan(x) || isinf(x) || x > 1 || x < -1) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_acos(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, 
MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x > 1 || x < -1) + continue; - double result = LIBC_NAMESPACE::acos(x); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_acos(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::acos(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/asin_fuzz.cpp b/libc/fuzzing/math/asin_fuzz.cpp index 94ae5c7bfdeee..e27d179606824 100644 --- a/libc/fuzzing/math/asin_fuzz.cpp +++ b/libc/fuzzing/math/asin_fuzz.cpp @@ -12,26 +12,41 @@ #include "src/math/asin.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf and values outside accepted range - if (isnan(x) || isinf(x) || x > 1 || x < -1) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_asin(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::asin(x); + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x > 1 || x < -1) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_asin(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::asin(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/cbrt_fuzz.cpp b/libc/fuzzing/math/cbrt_fuzz.cpp new file mode 100644 index 0000000000000..95f1df1695e56 --- /dev/null +++ b/libc/fuzzing/math/cbrt_fuzz.cpp @@ -0,0 +1,50 @@ +//===-- cbrt_fuzz.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Fuzzing test for llvm-libc cbrt implementation. 
+/// +//===----------------------------------------------------------------------===// + +#include "src/math/cbrt.h" +#include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + mpfr_t input; + mpfr_init2(input, 53); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_cbrt(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::cbrt(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } + mpfr_clear(input); + return 0; +} diff --git a/libc/fuzzing/math/cos_fuzz.cpp b/libc/fuzzing/math/cos_fuzz.cpp index 5b5ba0f7de717..6ed1e9ed8f309 100644 --- a/libc/fuzzing/math/cos_fuzz.cpp +++ b/libc/fuzzing/math/cos_fuzz.cpp @@ -12,28 +12,43 @@ #include "src/math/cos.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(const double x) { - // remove NaN and inf as preconditions - if (isnan(x)) - return 0; - if (isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_cos(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::cos(x); + // remove NaN and inf as preconditions + if (isnan(x)) + continue; + if (isinf(x)) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_cos(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::cos(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/exp10_fuzz.cpp b/libc/fuzzing/math/exp10_fuzz.cpp index 2baef03a264a4..d939948b723a5 100644 --- a/libc/fuzzing/math/exp10_fuzz.cpp +++ b/libc/fuzzing/math/exp10_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/exp10.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t 
size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_exp10(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::exp10(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_exp10(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::exp10(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/exp2_fuzz.cpp b/libc/fuzzing/math/exp2_fuzz.cpp index 8a2959047a6ca..a29d3c00da672 100644 --- a/libc/fuzzing/math/exp2_fuzz.cpp +++ b/libc/fuzzing/math/exp2_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/exp2.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_exp2(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::exp2(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_exp2(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::exp2(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/exp_fuzz.cpp b/libc/fuzzing/math/exp_fuzz.cpp index 97bc12dfa64c9..66823596dc6fa 100644 --- a/libc/fuzzing/math/exp_fuzz.cpp +++ b/libc/fuzzing/math/exp_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/exp.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); 
- int output = mpfr_exp(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::exp(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_exp(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::exp(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/expm1_fuzz.cpp b/libc/fuzzing/math/expm1_fuzz.cpp index db507bb02b1d7..0690e449c3d23 100644 --- a/libc/fuzzing/math/expm1_fuzz.cpp +++ b/libc/fuzzing/math/expm1_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/expm1.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_expm1(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::expm1(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_expm1(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::expm1(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/log10_fuzz.cpp b/libc/fuzzing/math/log10_fuzz.cpp index 23134f4903a45..369408cc288b5 100644 --- a/libc/fuzzing/math/log10_fuzz.cpp +++ b/libc/fuzzing/math/log10_fuzz.cpp @@ -27,10 +27,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { // remove NaN and inf and values outside accepted range if (isnan(x) || isinf(x) || x < 0) - return 0; + continue; // signed zeros already tested in unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_log10(input, input, MPFR_RNDN); diff --git a/libc/fuzzing/math/log1p_fuzz.cpp b/libc/fuzzing/math/log1p_fuzz.cpp index 5e138a65e3716..e02c61a352c1f 100644 --- 
a/libc/fuzzing/math/log1p_fuzz.cpp +++ b/libc/fuzzing/math/log1p_fuzz.cpp @@ -26,10 +26,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { data += sizeof(double); // remove NaN and inf and values outside accepted range if (isnan(x) || isinf(x) || x < -1) - return 0; + continue; // signed zeros already tested in unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_log1p(input, input, MPFR_RNDN); diff --git a/libc/fuzzing/math/log2_fuzz.cpp b/libc/fuzzing/math/log2_fuzz.cpp index aa19649b95126..c3e53c639cba9 100644 --- a/libc/fuzzing/math/log2_fuzz.cpp +++ b/libc/fuzzing/math/log2_fuzz.cpp @@ -27,10 +27,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { // remove NaN and inf and values outside accepted range if (isnan(x) || isinf(x) || x < 0) - return 0; + continue; // signed zeros already tested in unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_log2(input, input, MPFR_RNDN); diff --git a/libc/fuzzing/math/log_fuzz.cpp b/libc/fuzzing/math/log_fuzz.cpp index 03aa678d1f16c..9618accf3db26 100644 --- a/libc/fuzzing/math/log_fuzz.cpp +++ b/libc/fuzzing/math/log_fuzz.cpp @@ -27,10 +27,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { // remove NaN and inf and values outside accepted range if (isnan(x) || isinf(x) || x < 0) - return 0; + continue; // signed zeros already tested in unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_log(input, input, MPFR_RNDN); mpfr_subnormalize(input, output, MPFR_RNDN); diff --git a/libc/fuzzing/math/sin_fuzz.cpp b/libc/fuzzing/math/sin_fuzz.cpp index a5f0fa95c1581..f6d59c7e496bc 100644 --- a/libc/fuzzing/math/sin_fuzz.cpp +++ b/libc/fuzzing/math/sin_fuzz.cpp @@ -12,28 +12,43 @@ #include "src/math/sin.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(const double x) { - // remove NaN and inf as preconditions - if (isnan(x)) - return 0; - if (isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_sin(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::sin(x); + // remove NaN and inf as preconditions + if (isnan(x)) + continue; + if (isinf(x)) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_sin(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::sin(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/sincos_fuzz.cpp 
b/libc/fuzzing/math/sincos_fuzz.cpp index fd3dfae23168c..3d3306721fc47 100644 --- a/libc/fuzzing/math/sincos_fuzz.cpp +++ b/libc/fuzzing/math/sincos_fuzz.cpp @@ -12,15 +12,12 @@ #include "src/math/sincos.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf as preconditions - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_t sin_x; mpfr_t cos_x; @@ -28,21 +25,43 @@ extern "C" int LLVMFuzzerTestOneInput(double x) { mpfr_init2(input, 53); mpfr_init2(sin_x, 53); mpfr_init2(cos_x, 53); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - mpfr_set_d(input, x, MPFR_RNDN); + // remove NaN and inf as preconditions + if (isnan(x) || isinf(x)) + continue; - int output = mpfr_sin_cos(sin_x, cos_x, input, MPFR_RNDN); - mpfr_subnormalize(sin_x, output, MPFR_RNDN); - mpfr_subnormalize(cos_x, output, MPFR_RNDN); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - double to_compare_sin = mpfr_get_d(sin_x, MPFR_RNDN); - double to_compare_cos = mpfr_get_d(cos_x, MPFR_RNDN); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_sin_cos(sin_x, cos_x, input, MPFR_RNDN); + mpfr_subnormalize(sin_x, output, MPFR_RNDN); + mpfr_subnormalize(cos_x, output, MPFR_RNDN); - double sin_res, cos_res; - LIBC_NAMESPACE::sincos(x, &sin_res, &cos_res); + double to_compare_sin = mpfr_get_d(sin_x, MPFR_RNDN); + double to_compare_cos = mpfr_get_d(cos_x, MPFR_RNDN); - if (sin_res != to_compare_sin || cos_res != to_compare_cos) - __builtin_trap(); + double sin_res, cos_res; + LIBC_NAMESPACE::sincos(x, &sin_res, &cos_res); + + if (sin_res != to_compare_sin || cos_res != to_compare_cos) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing sin output: " << sin_res + << std::endl; + std::cout << std::hexfloat << "Expected sin: " << to_compare_sin + << std::endl; + std::cout << std::hexfloat << "Failing cos output: " << cos_res + << std::endl; + std::cout << std::hexfloat << "Expected cos: " << to_compare_cos + << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); mpfr_clear(sin_x); diff --git a/libc/fuzzing/math/sqrt_fuzz.cpp b/libc/fuzzing/math/sqrt_fuzz.cpp index e81cf1afd3728..969b4f58e342c 100644 --- a/libc/fuzzing/math/sqrt_fuzz.cpp +++ b/libc/fuzzing/math/sqrt_fuzz.cpp @@ -26,10 +26,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { data += sizeof(double); // remove NaN and inf and values outside accepted range if (isnan(x) || isinf(x) || x < 0) - return 0; + continue; // signed zeros already tested in unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_sqrt(input, input, MPFR_RNDN); diff --git a/libc/fuzzing/math/tan_fuzz.cpp b/libc/fuzzing/math/tan_fuzz.cpp index 2a462fa34fce4..63d3b12866a0e 100644 --- a/libc/fuzzing/math/tan_fuzz.cpp +++ b/libc/fuzzing/math/tan_fuzz.cpp @@ -12,28 +12,43 @@ #include "src/math/tan.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(const double x) { - // remove NaN and inf as preconditions - if (isnan(x)) - return 0; - if (isinf(x)) - return 0; - // signed zeros already tested in 
unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_tan(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::tan(x); + // remove NaN and inf as preconditions + if (isnan(x)) + continue; + if (isinf(x)) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_tan(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::tan(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index e4b3cb0faa820..f3bdc9f6aedd1 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -26,6 +26,8 @@ add_proxy_header_library( mbstate_t.h DEPENDS libc.include.llvm-libc-types.mbstate_t + FULL_BUILD_DEPENDS + libc.include.uchar ) add_proxy_header_library( diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 55268d19529c7..73213826ad607 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -69,7 +69,6 @@ add_header_macro( ../libc/include/dlfcn.yaml dlfcn.h DEPENDS - .llvm-libc-macros.dlfcn_macros .llvm_libc_common_h ) @@ -720,6 +719,15 @@ add_header_macro( .llvm-libc-types.wchar_t ) +add_header_macro( + wctype + ../libc/include/wctype.yaml + wctype.h + DEPENDS + .llvm_libc_common_h + .llvm-libc-types.wint_t +) + add_header_macro( locale ../libc/include/locale.yaml diff --git a/libc/include/dirent.yaml b/libc/include/dirent.yaml index 3fc522fda80e4..66570bca6c495 100644 --- a/libc/include/dirent.yaml +++ b/libc/include/dirent.yaml @@ -1,47 +1,45 @@ header: dirent.h -header_template: dirent.h.def -macros: [] +standards: + - posix types: - type_name: struct_dirent - type_name: DIR - type_name: ino_t -enums: [] -objects: [] functions: - name: alphasort standards: - - POSIX + - posix return_type: int arguments: - type: const struct dirent ** - type: const struct dirent ** - name: closedir standards: - - POSIX + - posix return_type: int arguments: - type: DIR * - name: dirfd standards: - - POSIX + - posix return_type: int arguments: - type: DIR * - name: fdopendir standards: - - POSIX + - posix return_type: DIR * arguments: - type: int - name: opendir standards: - - POSIX + - posix return_type: DIR * arguments: - type: const char * - name: readdir standards: - - POSIX + - posix return_type: struct dirent * arguments: - type: DIR * diff --git a/libc/include/dlfcn.yaml b/libc/include/dlfcn.yaml index 78bbeff4e60d9..28be34dbd95bd 100644 --- a/libc/include/dlfcn.yaml +++ b/libc/include/dlfcn.yaml @@ -1,17 +1,34 @@ header: dlfcn.h -header_template: dlfcn.h.def +standards: + - posix macros: + # Note that macro values are quoted to keep the integer literals as + # written. 
Without the quotes, YAML will normalize them to minimal + # decimal, which is less readable for humans seeing the generated header. - macro_name: RTLD_LAZY - macro_header: dlfcn-macros.h + macro_value: "0x00001" - macro_name: RTLD_NOW - macro_header: dlfcn-macros.h + macro_value: "0x00002" - macro_name: RTLD_GLOBAL - macro_header: dlfcn-macros.h + macro_value: "0x00100" - macro_name: RTLD_LOCAL - macro_header: dlfcn-macros.h -types: [] -enums: [] -objects: [] + macro_value: "0" + - macro_name: RTLD_BINDING_MASK + standards: + - gnu + macro_value: "0x00003" + - macro_name: RTLD_NOLOAD + standards: + - gnu + macro_value: "0x00004" + - macro_name: RTLD_DEEPBIND + standards: + - gnu + macro_value: "0x00008" + - macro_name: RTLD_NODELETE + standards: + - gnu + macro_value: "0x01000" functions: - name: dlclose standards: diff --git a/libc/include/llvm-libc-macros/dlfcn-macros.h b/libc/include/llvm-libc-macros/dlfcn-macros.h deleted file mode 100644 index dcd202b9ab435..0000000000000 --- a/libc/include/llvm-libc-macros/dlfcn-macros.h +++ /dev/null @@ -1,23 +0,0 @@ -//===-- Definition of macros from dlfcn.h ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_MACROS_DLFCN_MACROS_H -#define LLVM_LIBC_MACROS_DLFCN_MACROS_H - -#define RTLD_LAZY 0x00001 -#define RTLD_NOW 0x00002 -#define RTLD_GLOBAL 0x00100 -#define RTLD_LOCAL 0 - -// Non-standard stuff here -#define RTLD_BINDING_MASK 0x3 -#define RTLD_NOLOAD 0x00004 -#define RTLD_DEEPBIND 0x00008 -#define RTLD_NODELETE 0x01000 - -#endif // LLVM_LIBC_MACROS_DLFCN_MACROS_H diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h index 2f05d7544666e..6697ce5b03851 100644 --- a/libc/include/llvm-libc-macros/math-macros.h +++ b/libc/include/llvm-libc-macros/math-macros.h @@ -50,4 +50,105 @@ #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) #endif +// POSIX math constants +// https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/math.h.html +#define M_E (__extension__ 0x1.5bf0a8b145769p1) +#define M_EGAMMA (__extension__ 0x1.2788cfc6fb619p-1) +#define M_LOG2E (__extension__ 0x1.71547652b82fep0) +#define M_LOG10E (__extension__ 0x1.bcb7b1526e50ep-2) +#define M_LN2 (__extension__ 0x1.62e42fefa39efp-1) +#define M_LN10 (__extension__ 0x1.26bb1bbb55516p1) +#define M_PHI (__extension__ 0x1.9e3779b97f4a8p0) +#define M_PI (__extension__ 0x1.921fb54442d18p1) +#define M_PI_2 (__extension__ 0x1.921fb54442d18p0) +#define M_PI_4 (__extension__ 0x1.921fb54442d18p-1) +#define M_1_PI (__extension__ 0x1.45f306dc9c883p-2) +#define M_1_SQRTPI (__extension__ 0x1.20dd750429b6dp-1) +#define M_2_PI (__extension__ 0x1.45f306dc9c883p-1) +#define M_2_SQRTPI (__extension__ 0x1.20dd750429b6dp0) +#define M_SQRT2 (__extension__ 0x1.6a09e667f3bcdp0) +#define M_SQRT3 (__extension__ 0x1.bb67ae8584caap0) +#define M_SQRT1_2 (__extension__ 0x1.6a09e667f3bcdp-1) +#define M_SQRT1_3 (__extension__ 0x1.279a74590331cp-1) + +#define M_Ef (__extension__ 0x1.5bf0a8p1f) +#define M_EGAMMAf (__extension__ 0x1.2788dp-1f) +#define M_LOG2Ef (__extension__ 0x1.715476p0f) +#define M_LOG10Ef (__extension__ 0x1.bcb7b2p-2f) +#define M_LN2f (__extension__ 0x1.62e43p-1f) +#define M_LN10f (__extension__ 0x1.26bb1cp1f) +#define M_PHIf (__extension__ 0x1.9e377ap0f) 
+#define M_PIf (__extension__ 0x1.921fb6p1f) +#define M_PI_2f (__extension__ 0x1.921fb6p0f) +#define M_PI_4f (__extension__ 0x1.921fb6p-1f) +#define M_1_PIf (__extension__ 0x1.45f306p-2f) +#define M_1_SQRTPIf (__extension__ 0x1.20dd76p-1f) +#define M_2_PIf (__extension__ 0x1.45f306p-1f) +#define M_2_SQRTPIf (__extension__ 0x1.20dd76p0f) +#define M_SQRT2f (__extension__ 0x1.6a09e6p0f) +#define M_SQRT3f (__extension__ 0x1.bb67aep0f) +#define M_SQRT1_2f (__extension__ 0x1.6a09e6p-1f) +#define M_SQRT1_3f (__extension__ 0x1.279a74p-1f) + +#define M_El (__extension__ 0x1.5bf0a8b1457695355fb8ac404e7ap1L) +#define M_EGAMMAl (__extension__ 0x1.2788cfc6fb618f49a37c7f0202a6p-1L) +#define M_LOG2El (__extension__ 0x1.71547652b82fe1777d0ffda0d23ap0L) +#define M_LOG10El (__extension__ 0x1.bcb7b1526e50e32a6ab7555f5a68p-2L) +#define M_LN2l (__extension__ 0x1.62e42fefa39ef35793c7673007e6p-1L) +#define M_LN10l (__extension__ 0x1.26bb1bbb5551582dd4adac5705a6p1L) +#define M_PHIl (__extension__ 0x1.9e3779b97f4a7c15f39cc0605ceep0L) +#define M_PIl (__extension__ 0x1.921fb54442d18469898cc51701b8p1L) +#define M_PI_2l (__extension__ 0x1.921fb54442d18469898cc51701b8p0L) +#define M_PI_4l (__extension__ 0x1.921fb54442d18469898cc51701b8p-1L) +#define M_1_PIl (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-2L) +#define M_1_SQRTPIl (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep-1L) +#define M_2_PIl (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-1L) +#define M_2_SQRTPIl (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep0L) +#define M_SQRT2l (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p0L) +#define M_SQRT3l (__extension__ 0x1.bb67ae8584caa73b25742d7078b8p0L) +#define M_SQRT1_2l (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p-1L) +#define M_SQRT1_3l (__extension__ 0x1.279a74590331c4d218f81e4afb25p-1L) + +#ifdef __FLT16_MANT_DIG__ +#define M_Ef16 (__extension__ 0x1.5cp1f16) +#define M_EGAMMAf16 (__extension__ 0x1.278p-1f16) +#define M_LOG2Ef16 (__extension__ 0x1.714f16) +#define M_LOG10Ef16 (__extension__ 0x1.bccp-2f16) +#define M_LN2f16 (__extension__ 0x1.63p-1f16) +#define M_LN10f16 (__extension__ 0x1.26cp1f16) +#define M_PHIf16 (__extension__ 0x1.9e4p0f16) +#define M_PIf16 (__extension__ 0x1.92p1f16) +#define M_PI_2f16 (__extension__ 0x1.92p0f16) +#define M_PI_4f16 (__extension__ 0x1.92p-1f16) +#define M_1_PIf16 (__extension__ 0x1.46p-2f16) +#define M_1_SQRTPIf16 (__extension__ 0x1.20cp-1f16) +#define M_2_PIf16 (__extension__ 0x1.46p-1f16) +#define M_2_SQRTPIf16 (__extension__ 0x1.20cp0f16) +#define M_SQRT2f16 (__extension__ 0x1.6ap0f16) +#define M_SQRT3f16 (__extension__ 0x1.bb8p0f16) +#define M_SQRT1_2f16 (__extension__ 0x1.6ap-1f16) +#define M_SQRT1_3f16 (__extension__ 0x1.278p-1f16) +#endif // __FLT16_MANT_DIG__ + +#ifdef __SIZEOF_FLOAT128__ +#define M_Ef128 (__extension__ 0x1.5bf0a8b1457695355fb8ac404e7ap1q) +#define M_EGAMMAf128 (__extension__ 0x1.2788cfc6fb618f49a37c7f0202a6p-1q) +#define M_LOG2Ef128 (__extension__ 0x1.71547652b82fe1777d0ffda0d23ap0q) +#define M_LOG10Ef128 (__extension__ 0x1.bcb7b1526e50e32a6ab7555f5a68p-2q) +#define M_LN2f128 (__extension__ 0x1.62e42fefa39ef35793c7673007e6p-1q) +#define M_LN10f128 (__extension__ 0x1.26bb1bbb5551582dd4adac5705a6p1q) +#define M_PHIf128 (__extension__ 0x1.9e3779b97f4a7c15f39cc0605ceep0q) +#define M_PIf128 (__extension__ 0x1.921fb54442d18469898cc51701b8p1q) +#define M_PI_2f128 (__extension__ 0x1.921fb54442d18469898cc51701b8p0q) +#define M_PI_4f128 (__extension__ 0x1.921fb54442d18469898cc51701b8p-1q) +#define M_1_PIf128 (__extension__ 
0x1.45f306dc9c882a53f84eafa3ea6ap-2q) +#define M_1_SQRTPIf128 (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep-1q) +#define M_2_PIf128 (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-1q) +#define M_2_SQRTPIf128 (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep0q) +#define M_SQRT2f128 (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p0q) +#define M_SQRT3f128 (__extension__ 0x1.bb67ae8584caa73b25742d7078b8p0q) +#define M_SQRT1_2f128 (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p-1q) +#define M_SQRT1_3f128 (__extension__ 0x1.279a74590331c4d218f81e4afb25p-1q) +#endif // __SIZEOF_FLOAT128__ + #endif // LLVM_LIBC_MACROS_MATH_MACROS_H diff --git a/libc/include/math.yaml b/libc/include/math.yaml index 3044ec3437ff8..007be235f4380 100644 --- a/libc/include/math.yaml +++ b/libc/include/math.yaml @@ -33,14 +33,14 @@ functions: return_type: float arguments: - type: float - name: acoshf16 + - name: acoshf16 standards: - stdc return_type: _Float16 arguments: - type: _Float16 guard: LIBC_TYPES_HAS_FLOAT16 - name: acospif16 + - name: acospif16 standards: - stdc return_type: _Float16 diff --git a/libc/include/search.yaml b/libc/include/search.yaml index e0247afad2cd6..8a3a0c50af60f 100644 --- a/libc/include/search.yaml +++ b/libc/include/search.yaml @@ -1,6 +1,6 @@ header: search.h -header_template: search.h.def -macros: [] +standards: + - posix types: - type_name: ACTION - type_name: ENTRY @@ -12,35 +12,35 @@ objects: [] functions: - name: hcreate standards: - - POSIX + - posix return_type: int arguments: - type: size_t - name: hcreate_r - standards: GNUExtensions + standards: gnu return_type: int arguments: - type: size_t - type: struct hsearch_data * - name: hdestroy - standards: GNUExtensions + standards: gnu return_type: void arguments: [] - name: hdestroy_r standards: - - POSIX + - posix return_type: void arguments: - type: struct hsearch_data * - name: hsearch standards: - - POSIX + - posix return_type: ENTRY * arguments: - type: ENTRY - type: ACTION - name: hsearch_r - standards: GNUExtensions + standards: gnu return_type: int arguments: - type: ENTRY @@ -49,20 +49,20 @@ functions: - type: struct hsearch_data * - name: insque standards: - - POSIX + - posix return_type: void arguments: - type: void * - type: void * - name: remque standards: - - POSIX + - posix return_type: void arguments: - type: void * - name: lfind standards: - - POSIX + - posix return_type: void * arguments: - type: const void * @@ -72,7 +72,7 @@ functions: - type: __search_compare_t - name: lsearch standards: - - POSIX + - posix return_type: void * arguments: - type: const void * diff --git a/libc/include/setjmp.yaml b/libc/include/setjmp.yaml index 00049e58c86c8..55e03470e33ca 100644 --- a/libc/include/setjmp.yaml +++ b/libc/include/setjmp.yaml @@ -1,10 +1,8 @@ header: setjmp.h -header_template: setjmp.h.def -macros: [] +standards: + - stdc types: - type_name: jmp_buf -enums: [] -objects: [] functions: - name: longjmp standards: @@ -23,7 +21,7 @@ functions: - type: jmp_buf - name: sigsetjmp standards: - - POSIX + - posix return_type: int attributes: - _Returns_twice @@ -32,7 +30,7 @@ functions: - type: int - name: siglongjmp standards: - - POSIX + - posix return_type: _Noreturn void arguments: - type: sigjmp_buf diff --git a/libc/include/spawn.yaml b/libc/include/spawn.yaml index c763cc76fd094..ef39f66d080f6 100644 --- a/libc/include/spawn.yaml +++ b/libc/include/spawn.yaml @@ -1,17 +1,15 @@ header: spawn.h -header_template: spawn.h.def -macros: [] +standards: + - posix types: - type_name: posix_spawn_file_actions_t - 
type_name: posix_spawnattr_t - type_name: pid_t - type_name: mode_t -enums: [] -objects: [] functions: - name: posix_spawn standards: - - POSIX + - posix return_type: int arguments: - type: pid_t *__restrict @@ -22,14 +20,14 @@ functions: - type: char * const * __restrict - name: posix_spawn_file_actions_addclose standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t * - type: int - name: posix_spawn_file_actions_adddup2 standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t * @@ -37,7 +35,7 @@ functions: - type: int - name: posix_spawn_file_actions_addopen standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t *__restrict @@ -47,13 +45,13 @@ functions: - type: mode_t - name: posix_spawn_file_actions_destroy standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t * - name: posix_spawn_file_actions_init standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t * diff --git a/libc/include/string.yaml b/libc/include/string.yaml index 736deceb453de..0bf297ee747a4 100644 --- a/libc/include/string.yaml +++ b/libc/include/string.yaml @@ -1,5 +1,6 @@ header: string.h -header_template: string.h.def +standards: + - stdc macros: - macro_name: NULL macro_header: null-macro.h @@ -11,7 +12,7 @@ objects: [] functions: - name: memccpy standards: - - POSIX + - posix return_type: void * arguments: - type: void *__restrict @@ -61,7 +62,7 @@ functions: - type: size_t - name: mempcpy standards: - - POSIX + - posix return_type: void * arguments: - type: void *__restrict @@ -93,14 +94,14 @@ functions: - type: size_t - name: stpcpy standards: - - POSIX + - posix return_type: char * arguments: - type: char *__restrict - type: const char *__restrict - name: stpncpy standards: - - POSIX + - posix return_type: char * arguments: - type: char *__restrict @@ -243,7 +244,7 @@ functions: - type: size_t - name: strnlen standards: - - POSIX + - posix return_type: size_t arguments: - type: const char * @@ -271,7 +272,7 @@ functions: - type: const char *__restrict - name: strsignal standards: - - POSIX + - posix return_type: char * arguments: - type: int @@ -298,7 +299,7 @@ functions: - type: const char *__restrict - name: strtok_r standards: - - POSIX + - posix return_type: char * arguments: - type: char *__restrict diff --git a/libc/include/strings.h.def b/libc/include/strings.h.def deleted file mode 100644 index 9b016bf0bc50b..0000000000000 --- a/libc/include/strings.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- C standard library header strings.h -------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_STRINGS_H -#define LLVM_LIBC_STRINGS_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_STRINGS_H diff --git a/libc/include/strings.yaml b/libc/include/strings.yaml index 855800d9dbc3d..1e78f0e48aa59 100644 --- a/libc/include/strings.yaml +++ b/libc/include/strings.yaml @@ -1,15 +1,14 @@ header: strings.h -header_template: strings.h.def -macros: [] +standards: + - bsd + - posix types: - type_name: size_t - type_name: locale_t -enums: [] -objects: [] functions: - name: bcmp standards: - - llvm_libc_ext + - bsd return_type: int arguments: - type: const void * @@ -17,7 +16,7 @@ functions: - type: size_t - name: bcopy standards: - - llvm_libc_ext + - bsd return_type: void arguments: - type: const void * @@ -25,69 +24,61 @@ functions: - type: size_t - name: bzero standards: - - llvm_libc_ext + - bsd return_type: void arguments: - type: void * - type: size_t - name: ffs standards: - - POSIX + - posix return_type: int arguments: - type: int - name: ffsl standards: - - POSIX + - posix return_type: int arguments: - type: long - name: ffsll standards: - - POSIX + - posix return_type: int arguments: - type: long long - name: index standards: - - BSDExtensions + - bsd return_type: char * arguments: - type: const char * - type: int - name: rindex standards: - - BSDExtensions + - bsd return_type: char * arguments: - type: const char * - type: int - name: strcasecmp - standards: - - BSDExtensions return_type: int arguments: - type: const char * - type: const char * - name: strcasecmp_l - standards: - - BSDExtensions return_type: int arguments: - type: const char * - type: const char * - type: locale_t - name: strncasecmp - standards: - - BSDExtensions return_type: int arguments: - type: const char * - type: const char * - type: size_t - name: strncasecmp_l - standards: - - BSDExtensions return_type: int arguments: - type: const char * diff --git a/libc/include/sys/sendfile.h.def b/libc/include/sys/sendfile.h.def deleted file mode 100644 index d7f21f91f95ed..0000000000000 --- a/libc/include/sys/sendfile.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- Linux sys/sendfile.h ----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SYS_SENDFILE_H -#define LLVM_LIBC_SYS_SENDFILE_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_SYS_SENDFILE_H diff --git a/libc/include/sys/sendfile.yaml b/libc/include/sys/sendfile.yaml index 259ab83dff54b..a845dab580483 100644 --- a/libc/include/sys/sendfile.yaml +++ b/libc/include/sys/sendfile.yaml @@ -1,16 +1,8 @@ header: sys/sendfile.h -header_template: sendfile.h.def -macros: [] -types: - - type_name: ssize_t - - type_name: size_t - - type_name: off_t -enums: [] -objects: [] +standards: + - linux functions: - name: sendfile - standards: - - GNUExtensions return_type: ssize_t arguments: - type: int diff --git a/libc/include/sys/statvfs.h.def b/libc/include/sys/statvfs.h.def deleted file mode 100644 index f23c9a3d5b1f9..0000000000000 --- a/libc/include/sys/statvfs.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- POSIX header statvfs.h --------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SYS_STATVFS_H -#define LLVM_LIBC_SYS_STATVFS_H - -#include <__llvm-libc-common.h> - -%%public_api() - -#endif // LLVM_LIBC_SYS_STATVFS_H diff --git a/libc/include/sys/statvfs.yaml b/libc/include/sys/statvfs.yaml index 8c1d254add37f..e083677beee89 100644 --- a/libc/include/sys/statvfs.yaml +++ b/libc/include/sys/statvfs.yaml @@ -1,23 +1,21 @@ header: sys/statvfs.h -header_template: statvfs.h.def -macros: [] +standards: + - posix types: - type_name: struct_statvfs - type_name: fsblkcnt_t - type_name: fsfilcnt_t -enums: [] -objects: [] functions: - name: fstatvfs standards: - - POSIX + - posix return_type: int arguments: - type: int - type: struct statvfs * - name: statvfs standards: - - POSIX + - posix return_type: int arguments: - type: const char *__restrict diff --git a/libc/include/sys/types.yaml b/libc/include/sys/types.yaml index 6fa0b448fcd38..a00429d3817e1 100644 --- a/libc/include/sys/types.yaml +++ b/libc/include/sys/types.yaml @@ -1,32 +1,28 @@ header: sys/types.h -header_template: types.h.def -standards: POSIX -macros: [] +standards: + - posix types: - - type_name: uid_t - - type_name: time_t - - type_name: pthread_t - - type_name: pthread_rwlock_t - - type_name: pthread_rwlockattr_t - - type_name: pthread_mutex_t - type_name: blkcnt_t - type_name: blksize_t - type_name: clockid_t - - type_name: ssize_t - - type_name: pthread_mutexattr_t - - type_name: ino_t - - type_name: pthread_once_t - - type_name: mode_t - type_name: dev_t - - type_name: pthread_attr_t - type_name: gid_t - - type_name: pid_t + - type_name: ino_t + - type_name: mode_t - type_name: nlink_t - - type_name: suseconds_t - type_name: off_t - - type_name: size_t - - type_name: pthread_key_t + - type_name: pid_t + - type_name: pthread_attr_t - type_name: pthread_condattr_t -enums: [] -objects: [] -functions: [] + - type_name: pthread_key_t + - type_name: pthread_mutex_t + - type_name: pthread_mutexattr_t + - type_name: pthread_once_t + - type_name: pthread_rwlock_t + - type_name: pthread_rwlockattr_t + - type_name: pthread_t + - type_name: size_t + - type_name: ssize_t + - type_name: suseconds_t + - type_name: time_t + - type_name: 
uid_t diff --git a/libc/include/sys/uio.h.def b/libc/include/sys/uio.h.def deleted file mode 100644 index 76496cb2310f7..0000000000000 --- a/libc/include/sys/uio.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- POSIX header uio.h ------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SYS_UIO_H -#define LLVM_LIBC_SYS_UIO_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_SYS_UIO_H diff --git a/libc/include/sys/uio.yaml b/libc/include/sys/uio.yaml index 6d3f336b2b520..929911e669386 100644 --- a/libc/include/sys/uio.yaml +++ b/libc/include/sys/uio.yaml @@ -1,15 +1,13 @@ header: sys/uio.h -header_template: uio.h.def -macros: [] +standards: + - posix types: - type_name: struct_iovec - type_name: ssize_t -enums: [] -objects: [] functions: - name: writev standards: - - POSIX + - posix return_type: ssize_t arguments: - type: int @@ -17,7 +15,7 @@ functions: - type: int - name: readv standards: - - POSIX + - posix return_type: ssize_t arguments: - type: int diff --git a/libc/include/sys/utsname.h.def b/libc/include/sys/utsname.h.def deleted file mode 100644 index 08dbbfc062453..0000000000000 --- a/libc/include/sys/utsname.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- Linux sys/utsname.h -----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SYS_UTSNAME_H -#define LLVM_LIBC_SYS_UTSNAME_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_SYS_UTSNAME_H diff --git a/libc/include/sys/utsname.yaml b/libc/include/sys/utsname.yaml index 6c7cb71f9a34f..0f0e4cdb38952 100644 --- a/libc/include/sys/utsname.yaml +++ b/libc/include/sys/utsname.yaml @@ -1,14 +1,12 @@ header: sys/utsname.h -header_template: utsname.h.def -macros: [] +standards: + - posix types: - type_name: struct_utsname -enums: [] -objects: [] functions: - name: uname standards: - - POSIX + - posix return_type: int arguments: - type: struct utsname * diff --git a/libc/include/threads.h.def b/libc/include/threads.h.def deleted file mode 100644 index b114bea0ace34..0000000000000 --- a/libc/include/threads.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- C standard library header threads.h -------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_THREADS_H -#define LLVM_LIBC_THREADS_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_THREADS_H diff --git a/libc/include/threads.yaml b/libc/include/threads.yaml index 7014822f9251d..99b29f1815549 100644 --- a/libc/include/threads.yaml +++ b/libc/include/threads.yaml @@ -1,5 +1,6 @@ header: threads.h -header_template: threads.h.def +standards: + - stdc macros: - macro_name: ONCE_FLAG_INIT macro_value: '{0}' diff --git a/libc/include/uchar.yaml b/libc/include/uchar.yaml index 713919796762d..d0799e28ac9cb 100644 --- a/libc/include/uchar.yaml +++ b/libc/include/uchar.yaml @@ -1,14 +1,9 @@ header: uchar.h -header_template: uchar.h.def standards: - stdc -macros: [] types: - type_name: char32_t - type_name: char16_t - type_name: char8_t - type_name: mbstate_t - type_name: size_t -enums: [] -objects: [] -functions: [] diff --git a/libc/include/wctype.yaml b/libc/include/wctype.yaml new file mode 100644 index 0000000000000..fb4f96f7d17e4 --- /dev/null +++ b/libc/include/wctype.yaml @@ -0,0 +1,10 @@ +header: wctype.h +types: + - type_name: wint_t +functions: + - name: iswalpha + standards: + - stdc + return_type: int + arguments: + - type: wint_t diff --git a/libc/shared/math.h b/libc/shared/math.h index 3012cbb938816..e3c674c27ffaf 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -11,7 +11,16 @@ #include "libc_common.h" +#include "math/acos.h" +#include "math/acosf.h" +#include "math/acosf16.h" +#include "math/acoshf.h" +#include "math/acoshf16.h" +#include "math/erff.h" #include "math/exp.h" +#include "math/exp10.h" +#include "math/exp10f.h" +#include "math/exp10f16.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/acos.h b/libc/shared/math/acos.h new file mode 100644 index 0000000000000..73c6b512e16f4 --- /dev/null +++ b/libc/shared/math/acos.h @@ -0,0 +1,23 @@ +//===-- Shared acos function ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_ACOS_H +#define LLVM_LIBC_SHARED_MATH_ACOS_H + +#include "shared/libc_common.h" +#include "src/__support/math/acos.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::acos; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_ACOS_H diff --git a/libc/shared/math/acosf.h b/libc/shared/math/acosf.h new file mode 100644 index 0000000000000..7cdd64e7b379a --- /dev/null +++ b/libc/shared/math/acosf.h @@ -0,0 +1,23 @@ +//===-- Shared acosf function -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_ACOSF_H +#define LLVM_LIBC_SHARED_MATH_ACOSF_H + +#include "shared/libc_common.h" +#include "src/__support/math/acosf.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::acosf; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_ACOSF_H diff --git a/libc/shared/math/acosf16.h b/libc/shared/math/acosf16.h new file mode 100644 index 0000000000000..aaf6ed9922556 --- /dev/null +++ b/libc/shared/math/acosf16.h @@ -0,0 +1,29 @@ +//===-- Shared acosf16 function ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_ACOSF16_H +#define LLVM_LIBC_SHARED_MATH_ACOSF16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "shared/libc_common.h" +#include "src/__support/math/acosf16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::acosf16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_ACOSF16_H diff --git a/libc/shared/math/acoshf.h b/libc/shared/math/acoshf.h new file mode 100644 index 0000000000000..86bdbce3d905c --- /dev/null +++ b/libc/shared/math/acoshf.h @@ -0,0 +1,23 @@ +//===-- Shared acoshf function ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_ACOSHF_H +#define LLVM_LIBC_SHARED_MATH_ACOSHF_H + +#include "shared/libc_common.h" +#include "src/__support/math/acoshf.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::acoshf; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_ACOSHF_H diff --git a/libc/shared/math/acoshf16.h b/libc/shared/math/acoshf16.h new file mode 100644 index 0000000000000..2f0bc6e80ab6d --- /dev/null +++ b/libc/shared/math/acoshf16.h @@ -0,0 +1,29 @@ +//===-- Shared acoshf16 function --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_ACOSHF16_H +#define LLVM_LIBC_SHARED_MATH_ACOSHF16_H + +#include "include/llvm-libc-macros/float16-macros.h" +#include "shared/libc_common.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/math/acoshf16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::acoshf16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_ACOSHF16_H diff --git a/libc/shared/math/erff.h b/libc/shared/math/erff.h new file mode 100644 index 0000000000000..d0cca15570988 --- /dev/null +++ b/libc/shared/math/erff.h @@ -0,0 +1,23 @@ +//===-- Shared erff function ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_ERFF_H +#define LLVM_LIBC_SHARED_MATH_ERFF_H + +#include "shared/libc_common.h" +#include "src/__support/math/erff.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::erff; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_ERFF_H diff --git a/libc/shared/math/exp10.h b/libc/shared/math/exp10.h new file mode 100644 index 0000000000000..3d36d9103705f --- /dev/null +++ b/libc/shared/math/exp10.h @@ -0,0 +1,23 @@ +//===-- Shared exp10 function -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP10_H +#define LLVM_LIBC_SHARED_MATH_EXP10_H + +#include "shared/libc_common.h" +#include "src/__support/math/exp10.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp10; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_EXP10_H diff --git a/libc/shared/math/exp10f.h b/libc/shared/math/exp10f.h new file mode 100644 index 0000000000000..cd2ba54e6f4f2 --- /dev/null +++ b/libc/shared/math/exp10f.h @@ -0,0 +1,23 @@ +//===-- Shared exp10f function ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP10F_H +#define LLVM_LIBC_SHARED_MATH_EXP10F_H + +#include "shared/libc_common.h" +#include "src/__support/math/exp10f.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp10f; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_EXP10F_H diff --git a/libc/shared/math/exp10f16.h b/libc/shared/math/exp10f16.h new file mode 100644 index 0000000000000..af00787b058bc --- /dev/null +++ b/libc/shared/math/exp10f16.h @@ -0,0 +1,29 @@ +//===-- Shared exp10f16 function --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP10F16_H +#define LLVM_LIBC_SHARED_MATH_EXP10F16_H + +#include "include/llvm-libc-macros/float16-macros.h" +#include "shared/libc_common.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/math/exp10f16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp10f16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_EXP10F16_H diff --git a/libc/shared/sign.h b/libc/shared/sign.h new file mode 100644 index 0000000000000..faa8648b96439 --- /dev/null +++ b/libc/shared/sign.h @@ -0,0 +1,23 @@ +//===-- Shared sign type ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
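// Illustration, not part of the patch: a minimal sketch of how a consumer
// inside the LLVM tree is expected to use the shared wrapper headers added
// above, assuming "shared/math.h" is on the include path and LIBC_NAMESPACE
// is provided by shared/libc_common.h. Only functions introduced by this
// patch are called.
#include "shared/math.h"

double wrapped_acos(double x) {
  // Forwards to src/__support/math/acos.h through the shared:: alias.
  return LIBC_NAMESPACE::shared::acos(x);
}

float wrapped_exp10f(float x) {
  return LIBC_NAMESPACE::shared::exp10f(x);
}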
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_SIGN_H +#define LLVM_LIBC_SHARED_SIGN_H + +#include "libc_common.h" +#include "src/__support/sign.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using LIBC_NAMESPACE_DECL::Sign; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_SIGN_H diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt index a665253c4cc03..d7a1e1f49e6ff 100644 --- a/libc/src/CMakeLists.txt +++ b/libc/src/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(strings) add_subdirectory(time) add_subdirectory(unistd) add_subdirectory(wchar) +add_subdirectory(wctype) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(dirent) diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt index cc941f23135a6..f157d90abb8aa 100644 --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -209,6 +209,17 @@ add_header_library( libc.src.__support.macros.properties.types ) +add_header_library( + comparison_operations + HDRS + comparison_operations.h + DEPENDS + .fenv_impl + .fp_bits + libc.src.__support.CPP.type_traits + libc.src.__support.macros.config +) + add_header_library( hypot HDRS diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h index 41104620ed61d..7bec4e30a9960 100644 --- a/libc/src/__support/FPUtil/PolyEval.h +++ b/libc/src/__support/FPUtil/PolyEval.h @@ -37,7 +37,7 @@ LIBC_INLINE cpp::enable_if_t<(sizeof(T) <= sizeof(void *)), T> polyeval(T, } template -LIBC_INLINE cpp::enable_if_t<(sizeof(T) > sizeof(void *)), T> +LIBC_INLINE static constexpr cpp::enable_if_t<(sizeof(T) > sizeof(void *)), T> polyeval(const T &x, const T &a0, const Ts &...a) { return multiply_add(x, polyeval(x, a...), a0); } diff --git a/libc/src/__support/FPUtil/comparison_operations.h b/libc/src/__support/FPUtil/comparison_operations.h new file mode 100644 index 0000000000000..ff62ce085513b --- /dev/null +++ b/libc/src/__support/FPUtil/comparison_operations.h @@ -0,0 +1,114 @@ +//===-- Comparison operations on floating point numbers ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_COMPARISONOPERATIONS_H +#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_COMPARISONOPERATIONS_H + +#include "FEnvImpl.h" +#include "FPBits.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { +namespace fputil { + +// All predicates are hereby implemented as per IEEE Std 754-2019 +// Implements compareQuietEqual predicate +// Rules for comparison within the same floating point type +// 1. +0 = −0 +// 2. (i) +inf = +inf +// (ii) -inf = -inf +// (iii) -inf != +inf +// 3. 
Any comparison with NaN returns false +template <typename T> +LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, bool> equals(T x, + T y) { + using FPBits = FPBits<T>; + FPBits x_bits(x); + FPBits y_bits(y); + + if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan()) + fputil::raise_except_if_required(FE_INVALID); + + // NaN == x returns false for every x + if (x_bits.is_nan() || y_bits.is_nan()) + return false; + + // +/- 0 == +/- 0 + if (x_bits.is_zero() && y_bits.is_zero()) + return true; + + return x_bits.uintval() == y_bits.uintval(); +} + +// Implements compareSignalingLess predicate +// Section 5.11 Rules: +// 1. -inf < x (x != -inf) +// 2. x < +inf (x != +inf) +// 3. Any comparison with NaN returns false +template <typename T> +LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, bool> less_than(T x, + T y) { + using FPBits = FPBits<T>; + FPBits x_bits(x); + FPBits y_bits(y); + + // Any comparison with NaN returns false + if (x_bits.is_nan() || y_bits.is_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return false; + } + + if (x_bits.is_zero() && y_bits.is_zero()) + return false; + + if (x_bits.is_neg() && y_bits.is_pos()) + return true; + + if (x_bits.is_pos() && y_bits.is_neg()) + return false; + + // since floating-point numbers are stored in the format: s | e | m + // we can directly compare the uintvals + + // both negative + if (x_bits.is_neg()) + return x_bits.uintval() > y_bits.uintval(); + + // both positive + return x_bits.uintval() < y_bits.uintval(); +} + +// Implements compareSignalingGreater predicate +// x < y => y > x +template <typename T> +LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, bool> +greater_than(T x, T y) { + return less_than(y, x); +} + +// Implements compareSignalingLessEqual predicate +// x <= y => (x < y) || (x == y) +template <typename T> +LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, bool> +less_than_or_equals(T x, T y) { + return less_than(x, y) || equals(x, y); +} + +// Implements compareSignalingGreaterEqual predicate +// x >= y => (x > y) || (x == y) => (y < x) || (x == y) +template <typename T> +LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, bool> +greater_than_or_equals(T x, T y) { + return less_than(y, x) || equals(x, y); +} + +} // namespace fputil +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_COMPARISONOPERATIONS_H diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h index c27885aadc028..8e54e845de493 100644 --- a/libc/src/__support/FPUtil/double_double.h +++ b/libc/src/__support/FPUtil/double_double.h @@ -151,8 +151,8 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) { } template -LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a, - const DoubleDouble &b) { +LIBC_INLINE constexpr DoubleDouble quick_mult(const DoubleDouble &a, + const DoubleDouble &b) { DoubleDouble r = exact_mult(a.hi, b.hi); double t1 = multiply_add(a.hi, b.lo, r.lo); double t2 = multiply_add(a.lo, b.hi, t1); diff --git a/libc/src/__support/macros/properties/architectures.h b/libc/src/__support/macros/properties/architectures.h index c88956ff41148..ecc93196be286 100644 --- a/libc/src/__support/macros/properties/architectures.h +++ b/libc/src/__support/macros/properties/architectures.h @@ -21,7 +21,7 @@ #define LIBC_TARGET_ARCH_IS_GPU #endif -#if defined(__pnacl__) || defined(__CLR_VER) || defined(LIBC_TARGET_ARCH_IS_GPU) +#if defined(__CLR_VER) || defined(LIBC_TARGET_ARCH_IS_GPU) #define LIBC_TARGET_ARCH_IS_VM #endif diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h index cdb2df97b2b9a..fde30eadfd83b 100644 ---
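// Illustration, not part of the patch: the intended semantics of the new
// comparison_operations.h predicates, written as a small standalone check.
// It assumes compilation inside the libc tree so the header and
// LIBC_NAMESPACE resolve; the values exercise the NaN and +/-0 rules spelled
// out in the comments above.
#include "src/__support/FPUtil/comparison_operations.h"
#include <cassert>
#include <limits>

void comparison_smoke_test() {
  using namespace LIBC_NAMESPACE::fputil;
  double qnan = std::numeric_limits<double>::quiet_NaN();
  assert(equals(0.0, -0.0));         // +0 and -0 compare equal
  assert(!equals(qnan, qnan));       // any comparison with NaN is false
  assert(!less_than(qnan, 1.0));     // compareSignalingLess is false on NaN
  assert(less_than(-1.0, -0.5));     // both-negative path
  assert(greater_than_or_equals(2.0, 2.0));
}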
a/libc/src/__support/macros/properties/cpu_features.h +++ b/libc/src/__support/macros/properties/cpu_features.h @@ -81,7 +81,7 @@ #endif #if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) || \ - defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA) + defined(__NVPTX__) || defined(__AMDGPU__) || defined(__riscv_flen) #define LIBC_TARGET_CPU_HAS_FMA // Provide a more fine-grained control of FMA instruction for ARM targets. #if defined(LIBC_TARGET_CPU_HAS_FPU_HALF) diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index f7ef9e7694fe6..9a8a4d16a4a0e 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -1,3 +1,126 @@ +add_header_library( + acos + HDRS + acos.h + DEPENDS + .asin_utils + libc.src.__support.math.asin_utils + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types + libc.src.__support.macros.properties.cpu_features +) + +add_header_library( + acosf + HDRS + acosf.h + DEPENDS + .inv_trigf_utils + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization +) + +add_header_library( + acosf16 + HDRS + acosf16.h + DEPENDS + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types +) + +add_header_library( + acosh_float_constants + HDRS + acosh_float_constants.h + DEPENDS + libc.src.__support.macros.config +) + +add_header_library( + acoshf_utils + HDRS + acoshf_utils.h + DEPENDS + .acosh_float_constants + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval +) + +add_header_library( + acoshf + HDRS + acoshf.h + DEPENDS + .acoshf_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization +) + +add_header_library( + acoshf16 + HDRS + acoshf16.h + DEPENDS + .acoshf_utils + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization +) + +add_header_library( + asin_utils + HDRS + asin_utils.h + DEPENDS + libc.src.__support.integer_literals + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.macros.optimization +) + +add_header_library( + erff + HDRS + erff.h + DEPENDS + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.multiply_add + 
libc.src.__support.FPUtil.polyeval + libc.src.__support.macros.optimization +) + add_header_library( exp_float_constants HDRS @@ -65,6 +188,16 @@ add_header_library( libc.src.__support.FPUtil.manipulation_functions ) +add_header_library( + inv_trigf_utils + HDRS + inv_trigf_utils.h + DEPENDS + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.common +) + add_header_library( frexpf16 HDRS @@ -149,3 +282,86 @@ add_header_library( libc.src.__support.integer_literals libc.src.__support.macros.optimization ) + +add_header_library( + exp10 + HDRS + exp10.h + DEPENDS + .exp_constants + .exp_utils + libc.src.__support.CPP.bit + libc.src.__support.CPP.optional + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.FPUtil.triple_double + libc.src.__support.integer_literals + libc.src.__support.macros.optimization +) + +add_header_library( + exp10f_utils + HDRS + exp10f_utils.h + DEPENDS + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.common + libc.src.__support.math.exp_utils +) + +add_header_library( + exp10f + HDRS + exp10f.h + DEPENDS + .exp10f_utils + libc.src.__support.macros.config + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization +) + +add_header_library( + exp10_float16_constants + HDRS + exp10_float16_constants.h + DEPENDS + libc.src.__support.CPP.array +) + +add_header_library( + exp10f16_utils + HDRS + exp10f16_utils.h + DEPENDS + .expf16_utils + .exp10_float16_constants + libc.src.__support.FPUtil.fp_bits +) + +add_header_library( + exp10f16 + HDRS + exp10f16.h + DEPENDS + .exp10f16_utils + libc.src.__support.FPUtil.fp_bits + src.__support.FPUtil.FEnvImpl + src.__support.FPUtil.FPBits + src.__support.FPUtil.cast + src.__support.FPUtil.rounding_mode + src.__support.FPUtil.except_value_utils + src.__support.macros.optimization + src.__support.macros.properties.cpu_features +) diff --git a/libc/src/__support/math/acos.h b/libc/src/__support/math/acos.h new file mode 100644 index 0000000000000..a52ead7fc1b3b --- /dev/null +++ b/libc/src/__support/math/acos.h @@ -0,0 +1,284 @@ +//===-- Implementation header for acos --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ACOS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_ACOS_H + +#include "asin_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +static constexpr double acos(double x) { + using DoubleDouble = fputil::DoubleDouble; + using namespace asin_internal; + using FPBits = fputil::FPBits; + + FPBits xbits(x); + int x_exp = xbits.get_biased_exponent(); + + // |x| < 0.5. + if (x_exp < FPBits::EXP_BIAS - 1) { + // |x| < 2^-55. + if (LIBC_UNLIKELY(x_exp < FPBits::EXP_BIAS - 55)) { + // When |x| < 2^-55, acos(x) = pi/2 +#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) + return PI_OVER_TWO.hi; +#else + // Force the evaluation and prevent constant propagation so that it + // is rounded correctly for FE_UPWARD rounding mode. + return (xbits.abs().get_val() + 0x1.0p-160) + PI_OVER_TWO.hi; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + } + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + // acos(x) = pi/2 - asin(x) + // = pi/2 - x * P(x^2) + double p = asin_eval(x * x); + return PI_OVER_TWO.hi + fputil::multiply_add(-x, p, PI_OVER_TWO.lo); +#else + unsigned idx = 0; + DoubleDouble x_sq = fputil::exact_mult(x, x); + double err = xbits.abs().get_val() * 0x1.0p-51; + // Polynomial approximation: + // p ~ asin(x)/x + DoubleDouble p = asin_eval(x_sq, idx, err); + // asin(x) ~ x * p + DoubleDouble r0 = fputil::exact_mult(x, p.hi); + // acos(x) = pi/2 - asin(x) + // ~ pi/2 - x * p + // = pi/2 - x * (p.hi + p.lo) + double r_hi = fputil::multiply_add(-x, p.hi, PI_OVER_TWO.hi); + // Use Dekker's 2SUM algorithm to compute the lower part. + double r_lo = ((PI_OVER_TWO.hi - r_hi) - r0.hi) - r0.lo; + r_lo = fputil::multiply_add(-x, p.lo, r_lo + PI_OVER_TWO.lo); + + // Ziv's accuracy test. + + double r_upper = r_hi + (r_lo + err); + double r_lower = r_hi + (r_lo - err); + + if (LIBC_LIKELY(r_upper == r_lower)) + return r_upper; + + // Ziv's accuracy test failed, perform 128-bit calculation. + + // Recalculate mod 1/64. + idx = static_cast(fputil::nearest_integer(x_sq.hi * 0x1.0p6)); + + // Get x^2 - idx/64 exactly. When FMA is available, double-double + // multiplication will be correct for all rounding modes. Otherwise we use + // Float128 directly. + Float128 x_f128(x); + +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE + // u = x^2 - idx/64 + Float128 u_hi( + fputil::multiply_add(static_cast(idx), -0x1.0p-6, x_sq.hi)); + Float128 u = fputil::quick_add(u_hi, Float128(x_sq.lo)); +#else + Float128 x_sq_f128 = fputil::quick_mul(x_f128, x_f128); + Float128 u = fputil::quick_add( + x_sq_f128, Float128(static_cast(idx) * (-0x1.0p-6))); +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + + Float128 p_f128 = asin_eval(u, idx); + // Flip the sign of x_f128 to perform subtraction. 
+ x_f128.sign = x_f128.sign.negate(); + Float128 r = + fputil::quick_add(PI_OVER_TWO_F128, fputil::quick_mul(x_f128, p_f128)); + + return static_cast(r); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + } + // |x| >= 0.5 + + double x_abs = xbits.abs().get_val(); + + // Maintaining the sign: + constexpr double SIGN[2] = {1.0, -1.0}; + double x_sign = SIGN[xbits.is_neg()]; + // |x| >= 1 + if (LIBC_UNLIKELY(x_exp >= FPBits::EXP_BIAS)) { + // x = +-1, asin(x) = +- pi/2 + if (x_abs == 1.0) { + // x = 1, acos(x) = 0, + // x = -1, acos(x) = pi + return x == 1.0 ? 0.0 : fputil::multiply_add(-x_sign, PI.hi, PI.lo); + } + // |x| > 1, return NaN. + if (xbits.is_quiet_nan()) + return x; + + // Set domain error for non-NaN input. + if (!xbits.is_nan()) + fputil::set_errno_if_required(EDOM); + + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + // When |x| >= 0.5, we perform range reduction as follow: + // + // When 0.5 <= x < 1, let: + // y = acos(x) + // We will use the double angle formula: + // cos(2y) = 1 - 2 sin^2(y) + // and the complement angle identity: + // x = cos(y) = 1 - 2 sin^2 (y/2) + // So: + // sin(y/2) = sqrt( (1 - x)/2 ) + // And hence: + // y/2 = asin( sqrt( (1 - x)/2 ) ) + // Equivalently: + // acos(x) = y = 2 * asin( sqrt( (1 - x)/2 ) ) + // Let u = (1 - x)/2, then: + // acos(x) = 2 * asin( sqrt(u) ) + // Moreover, since 0.5 <= x < 1: + // 0 < u <= 1/4, and 0 < sqrt(u) <= 0.5, + // And hence we can reuse the same polynomial approximation of asin(x) when + // |x| <= 0.5: + // acos(x) ~ 2 * sqrt(u) * P(u). + // + // When -1 < x <= -0.5, we reduce to the previous case using the formula: + // acos(x) = pi - acos(-x) + // = pi - 2 * asin ( sqrt( (1 + x)/2 ) ) + // ~ pi - 2 * sqrt(u) * P(u), + // where u = (1 - |x|)/2. + + // u = (1 - |x|)/2 + double u = fputil::multiply_add(x_abs, -0.5, 0.5); + // v_hi + v_lo ~ sqrt(u). + // Let: + // h = u - v_hi^2 = (sqrt(u) - v_hi) * (sqrt(u) + v_hi) + // Then: + // sqrt(u) = v_hi + h / (sqrt(u) + v_hi) + // ~ v_hi + h / (2 * v_hi) + // So we can use: + // v_lo = h / (2 * v_hi). + double v_hi = fputil::sqrt(u); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + constexpr DoubleDouble CONST_TERM[2] = {{0.0, 0.0}, PI}; + DoubleDouble const_term = CONST_TERM[xbits.is_neg()]; + + double p = asin_eval(u); + double scale = x_sign * 2.0 * v_hi; + double r = const_term.hi + fputil::multiply_add(scale, p, const_term.lo); + return r; +#else + +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE + double h = fputil::multiply_add(v_hi, -v_hi, u); +#else + DoubleDouble v_hi_sq = fputil::exact_mult(v_hi, v_hi); + double h = (u - v_hi_sq.hi) - v_hi_sq.lo; +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + + // Scale v_lo and v_hi by 2 from the formula: + // vh = v_hi * 2 + // vl = 2*v_lo = h / v_hi. + double vh = v_hi * 2.0; + double vl = h / v_hi; + + // Polynomial approximation: + // p ~ asin(sqrt(u))/sqrt(u) + unsigned idx = 0; + double err = vh * 0x1.0p-51; + + DoubleDouble p = asin_eval(DoubleDouble{0.0, u}, idx, err); + + // Perform computations in double-double arithmetic: + // asin(x) = pi/2 - (v_hi + v_lo) * (ASIN_COEFFS[idx][0] + p) + DoubleDouble r0 = fputil::quick_mult(DoubleDouble{vl, vh}, p); + + double r_hi = 0, r_lo = 0; + if (xbits.is_pos()) { + r_hi = r0.hi; + r_lo = r0.lo; + } else { + DoubleDouble r = fputil::exact_add(PI.hi, -r0.hi); + r_hi = r.hi; + r_lo = (PI.lo - r0.lo) + r.lo; + } + + // Ziv's accuracy test. 
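// Illustration, not part of the patch: the idea behind the Ziv accuracy test
// used just below. The result is r_hi + r_lo with |error| <= err; if adding
// +err and -err rounds to the same double, that double is already the
// correctly rounded answer and the slower high-precision fallback is skipped.
#include <cstdio>

bool ziv_accepts(double r_hi, double r_lo, double err, double &rounded) {
  double upper = r_hi + (r_lo + err);
  double lower = r_hi + (r_lo - err);
  rounded = upper;
  return upper == lower;
}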
+ + double r_upper = r_hi + (r_lo + err); + double r_lower = r_hi + (r_lo - err); + + if (LIBC_LIKELY(r_upper == r_lower)) + return r_upper; + + // Ziv's accuracy test failed, we redo the computations in Float128. + // Recalculate mod 1/64. + idx = static_cast(fputil::nearest_integer(u * 0x1.0p6)); + + // After the first step of Newton-Raphson approximating v = sqrt(u), we have + // that: + // sqrt(u) = v_hi + h / (sqrt(u) + v_hi) + // v_lo = h / (2 * v_hi) + // With error: + // sqrt(u) - (v_hi + v_lo) = h * ( 1/(sqrt(u) + v_hi) - 1/(2*v_hi) ) + // = -h^2 / (2*v * (sqrt(u) + v)^2). + // Since: + // (sqrt(u) + v_hi)^2 ~ (2sqrt(u))^2 = 4u, + // we can add another correction term to (v_hi + v_lo) that is: + // v_ll = -h^2 / (2*v_hi * 4u) + // = -v_lo * (h / 4u) + // = -vl * (h / 8u), + // making the errors: + // sqrt(u) - (v_hi + v_lo + v_ll) = O(h^3) + // well beyond 128-bit precision needed. + + // Get the rounding error of vl = 2 * v_lo ~ h / vh + // Get full product of vh * vl +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE + double vl_lo = fputil::multiply_add(-v_hi, vl, h) / v_hi; +#else + DoubleDouble vh_vl = fputil::exact_mult(v_hi, vl); + double vl_lo = ((h - vh_vl.hi) - vh_vl.lo) / v_hi; +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + // vll = 2*v_ll = -vl * (h / (4u)). + double t = h * (-0.25) / u; + double vll = fputil::multiply_add(vl, t, vl_lo); + // m_v = -(v_hi + v_lo + v_ll). + Float128 m_v = fputil::quick_add( + Float128(vh), fputil::quick_add(Float128(vl), Float128(vll))); + m_v.sign = xbits.sign(); + + // Perform computations in Float128: + // acos(x) = (v_hi + v_lo + vll) * P(u) , when 0.5 <= x < 1, + // = pi - (v_hi + v_lo + vll) * P(u) , when -1 < x <= -0.5. + Float128 y_f128(fputil::multiply_add(static_cast(idx), -0x1.0p-6, u)); + + Float128 p_f128 = asin_eval(y_f128, idx); + Float128 r_f128 = fputil::quick_mul(m_v, p_f128); + + if (xbits.is_neg()) + r_f128 = fputil::quick_add(PI_F128, r_f128); + + return static_cast(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ACOS_H diff --git a/libc/src/__support/math/acosf.h b/libc/src/__support/math/acosf.h new file mode 100644 index 0000000000000..153087e998211 --- /dev/null +++ b/libc/src/__support/math/acosf.h @@ -0,0 +1,147 @@ +//===-- Implementation header for acosf -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
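// Illustration, not part of the patch: a standalone numeric check of the
// range reduction quoted in the acos comments above, using <cmath> only. It
// does not touch the double-double or Float128 paths; it only confirms the
// identities acos(x) = 2*asin(sqrt((1 - x)/2)) for 0.5 <= x < 1 and
// acos(x) = pi - acos(-x) for -1 < x <= -0.5.
#include <cmath>
#include <cstdio>

int main() {
  const double pi = std::acos(-1.0);
  for (double x : {0.5, 0.75, 0.999, -0.6, -0.9}) {
    double u = (1.0 - std::fabs(x)) / 2.0;
    double reduced = 2.0 * std::asin(std::sqrt(u));
    if (x < 0.0)
      reduced = pi - reduced;
    std::printf("x=% .3f  acos(x)=%.17g  reduced=%.17g\n", x, std::acos(x),
                reduced);
  }
  return 0;
}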
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ACOSF_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_ACOSF_H + +#include "inv_trigf_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +namespace acosf_internal { + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +static constexpr size_t N_EXCEPTS = 4; + +// Exceptional values when |x| <= 0.5 +static constexpr fputil::ExceptValues ACOSF_EXCEPTS = {{ + // (inputs, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.110b46p-26, acosf(x) = 0x1.921fb4p0 (RZ) + {0x328885a3, 0x3fc90fda, 1, 0, 1}, + // x = -0x1.110b46p-26, acosf(x) = 0x1.921fb4p0 (RZ) + {0xb28885a3, 0x3fc90fda, 1, 0, 1}, + // x = 0x1.04c444p-12, acosf(x) = 0x1.920f68p0 (RZ) + {0x39826222, 0x3fc907b4, 1, 0, 1}, + // x = -0x1.04c444p-12, acosf(x) = 0x1.923p0 (RZ) + {0xb9826222, 0x3fc91800, 1, 0, 1}, +}}; + +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +} // namespace acosf_internal + +static constexpr float acosf(float x) { + using namespace acosf_internal; + using namespace inv_trigf_utils_internal; + using FPBits = typename fputil::FPBits; + + FPBits xbits(x); + uint32_t x_uint = xbits.uintval(); + uint32_t x_abs = xbits.uintval() & 0x7fff'ffffU; + uint32_t x_sign = x_uint >> 31; + + // |x| <= 0.5 + if (LIBC_UNLIKELY(x_abs <= 0x3f00'0000U)) { + // |x| < 0x1p-10 + if (LIBC_UNLIKELY(x_abs < 0x3a80'0000U)) { + // When |x| < 2^-10, we use the following approximation: + // acos(x) = pi/2 - asin(x) + // ~ pi/2 - x - x^3 / 6 + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + // Check for exceptional values + if (auto r = ACOSF_EXCEPTS.lookup(x_uint); LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + double xd = static_cast(x); + return static_cast(fputil::multiply_add( + -0x1.5555555555555p-3 * xd, xd * xd, M_MATH_PI_2 - xd)); + } + + // For |x| <= 0.5, we approximate acosf(x) by: + // acos(x) = pi/2 - asin(x) = pi/2 - x * P(x^2) + // Where P(X^2) = Q(X) is a degree-20 minimax even polynomial approximating + // asin(x)/x on [0, 0.5] generated by Sollya with: + // > Q = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20|], + // [|1, D...|], [0, 0.5]); + double xd = static_cast(x); + double xsq = xd * xd; + double x3 = xd * xsq; + double r = asin_eval(xsq); + return static_cast(fputil::multiply_add(-x3, r, M_MATH_PI_2 - xd)); + } + + // |x| >= 1, return 0, 2pi, or NaNs. + if (LIBC_UNLIKELY(x_abs >= 0x3f80'0000U)) { + if (x_abs == 0x3f80'0000U) + return x_sign ? 
/* x == -1.0f */ fputil::round_result_slightly_down( + 0x1.921fb6p+1f) + : /* x == 1.0f */ 0.0f; + + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + // |x| <= +/-inf + if (x_abs <= 0x7f80'0000U) { + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + } + + return x + FPBits::quiet_nan().get_val(); + } + + // When 0.5 < |x| < 1, we perform range reduction as follow: + // + // Assume further that 0.5 < x <= 1, and let: + // y = acos(x) + // We use the double angle formula: + // x = cos(y) = 1 - 2 sin^2(y/2) + // So: + // sin(y/2) = sqrt( (1 - x)/2 ) + // And hence: + // y = 2 * asin( sqrt( (1 - x)/2 ) ) + // Let u = (1 - x)/2, then + // acos(x) = 2 * asin( sqrt(u) ) + // Moreover, since 0.5 < x <= 1, + // 0 <= u < 1/4, and 0 <= sqrt(u) < 0.5, + // And hence we can reuse the same polynomial approximation of asin(x) when + // |x| <= 0.5: + // acos(x) ~ 2 * sqrt(u) * P(u). + // + // When -1 < x <= -0.5, we use the identity: + // acos(x) = pi - acos(-x) + // which is reduced to the postive case. + + xbits.set_sign(Sign::POS); + double xd = static_cast(xbits.get_val()); + double u = fputil::multiply_add(-0.5, xd, 0.5); + double cv = 2 * fputil::sqrt(u); + + double r3 = asin_eval(u); + double r = fputil::multiply_add(cv * u, r3, cv); + return static_cast(x_sign ? M_MATH_PI - r : r); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ACOS_H diff --git a/libc/src/__support/math/acosf16.h b/libc/src/__support/math/acosf16.h new file mode 100644 index 0000000000000..58d3761b95245 --- /dev/null +++ b/libc/src/__support/math/acosf16.h @@ -0,0 +1,164 @@ +//===-- Implementation header for acosf16 -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ACOSF16_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_ACOSF16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +static constexpr float16 acosf16(float16 x) { + + // Generated by Sollya using the following command: + // > round(pi/2, SG, RN); + // > round(pi, SG, RN); + constexpr float PI_OVER_2 = 0x1.921fb6p0f; + constexpr float PI = 0x1.921fb6p1f; + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + constexpr size_t N_EXCEPTS = 2; + + constexpr fputil::ExceptValues ACOSF16_EXCEPTS{{ + // (input, RZ output, RU offset, RD offset, RN offset) + {0xacaf, 0x3e93, 1, 0, 0}, + {0xb874, 0x4052, 1, 0, 1}, + }}; +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + using FPBits = fputil::FPBits; + FPBits xbits(x); + + uint16_t x_u = xbits.uintval(); + uint16_t x_abs = x_u & 0x7fff; + uint16_t x_sign = x_u >> 15; + + // |x| > 0x1p0, |x| > 1, or x is NaN. 
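// Illustration, not part of the patch: a rough model of how one entry of the
// exceptional-value tables above (round-toward-zero result plus per-mode
// offsets) is applied once the input bit pattern matches. The real logic
// lives in src/__support/FPUtil/except_value_utils.h; the offset
// interpretation here is an assumption for illustration only.
#include <cfenv>
#include <cstdint>
#include <cstring>

float result_from_except_entry(uint32_t rz_bits, uint32_t ru_off,
                               uint32_t rd_off, uint32_t rn_off) {
  uint32_t out = rz_bits; // stored value is the round-toward-zero result
  switch (std::fegetround()) {
  case FE_UPWARD:
    out += ru_off;
    break;
  case FE_DOWNWARD:
    out += rd_off;
    break;
  case FE_TONEAREST:
    out += rn_off;
    break;
  default: // FE_TOWARDZERO: keep the stored bits
    break;
  }
  float result;
  std::memcpy(&result, &out, sizeof(result)); // bit-cast back to float
  return result;
}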
+ if (LIBC_UNLIKELY(x_abs > 0x3c00)) { + // acosf16(NaN) = NaN + if (xbits.is_nan()) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // 1 < |x| <= +/-inf + fputil::raise_except_if_required(FE_INVALID); + fputil::set_errno_if_required(EDOM); + + return FPBits::quiet_nan().get_val(); + } + + float xf = x; + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + // Handle exceptional values + if (auto r = ACOSF16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + // |x| == 0x1p0, x is 1 or -1 + // if x is (-)1, return pi, else + // if x is (+)1, return 0 + if (LIBC_UNLIKELY(x_abs == 0x3c00)) + return fputil::cast(x_sign ? PI : 0.0f); + + float xsq = xf * xf; + + // |x| <= 0x1p-1, |x| <= 0.5 + if (x_abs <= 0x3800) { + // if x is 0, return pi/2 + if (LIBC_UNLIKELY(x_abs == 0)) + return fputil::cast(PI_OVER_2); + + // Note that: acos(x) = pi/2 + asin(-x) = pi/2 - asin(x) + // Degree-6 minimax polynomial of asin(x) generated by Sollya with: + // > P = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8|], [|SG...|], [0, 0.5]); + float interm = + fputil::polyeval(xsq, 0x1.000002p0f, 0x1.554c2ap-3f, 0x1.3541ccp-4f, + 0x1.43b2d6p-5f, 0x1.a0d73ep-5f); + return fputil::cast(fputil::multiply_add(-xf, interm, PI_OVER_2)); + } + + // When |x| > 0.5, assume that 0.5 < |x| <= 1 + // + // Step-by-step range-reduction proof: + // 1: Let y = asin(x), such that, x = sin(y) + // 2: From complimentary angle identity: + // x = sin(y) = cos(pi/2 - y) + // 3: Let z = pi/2 - y, such that x = cos(z) + // 4: From double angle formula; cos(2A) = 1 - 2 * sin^2(A): + // z = 2A, z/2 = A + // cos(z) = 1 - 2 * sin^2(z/2) + // 5: Make sin(z/2) subject of the formula: + // sin(z/2) = sqrt((1 - cos(z))/2) + // 6: Recall [3]; x = cos(z). Therefore: + // sin(z/2) = sqrt((1 - x)/2) + // 7: Let u = (1 - x)/2 + // 8: Therefore: + // asin(sqrt(u)) = z/2 + // 2 * asin(sqrt(u)) = z + // 9: Recall [3]; z = pi/2 - y. Therefore: + // y = pi/2 - z + // y = pi/2 - 2 * asin(sqrt(u)) + // 10: Recall [1], y = asin(x). Therefore: + // asin(x) = pi/2 - 2 * asin(sqrt(u)) + // 11: Recall that: acos(x) = pi/2 + asin(-x) = pi/2 - asin(x) + // Therefore: + // acos(x) = pi/2 - (pi/2 - 2 * asin(sqrt(u))) + // acos(x) = 2 * asin(sqrt(u)) + // + // THE RANGE REDUCTION, HOW? + // 12: Recall [7], u = (1 - x)/2 + // 13: Since 0.5 < x <= 1, therefore: + // 0 <= u <= 0.25 and 0 <= sqrt(u) <= 0.5 + // + // Hence, we can reuse the same [0, 0.5] domain polynomial approximation for + // Step [11] as `sqrt(u)` is in range. + // When -1 < x <= -0.5, the identity: + // acos(x) = pi - acos(-x) + // allows us to compute for the negative x value (lhs) + // with a positive x value instead (rhs). + + float xf_abs = (xf < 0 ? -xf : xf); + float u = fputil::multiply_add(-0.5f, xf_abs, 0.5f); + float sqrt_u = fputil::sqrt(u); + + // Degree-6 minimax polynomial of asin(x) generated by Sollya with: + // > P = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8|], [|SG...|], [0, 0.5]); + float asin_sqrt_u = + sqrt_u * fputil::polyeval(u, 0x1.000002p0f, 0x1.554c2ap-3f, + 0x1.3541ccp-4f, 0x1.43b2d6p-5f, 0x1.a0d73ep-5f); + + return fputil::cast( + x_sign ? 
fputil::multiply_add(-2.0f, asin_sqrt_u, PI) : 2 * asin_sqrt_u); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ACOS_H diff --git a/libc/src/__support/math/acosh_float_constants.h b/libc/src/__support/math/acosh_float_constants.h new file mode 100644 index 0000000000000..2eb245d8265e0 --- /dev/null +++ b/libc/src/__support/math/acosh_float_constants.h @@ -0,0 +1,114 @@ +//===-- Common constants for acoshf function --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ACOSH_FLOAT_CONSTANTS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_ACOSH_FLOAT_CONSTANTS_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace acoshf_internal { + +// Lookup table for (1/f) where f = 1 + n*2^(-7), n = 0..127. +static constexpr double ONE_OVER_F[128] = { + 0x1.0000000000000p+0, 0x1.fc07f01fc07f0p-1, 0x1.f81f81f81f820p-1, + 0x1.f44659e4a4271p-1, 0x1.f07c1f07c1f08p-1, 0x1.ecc07b301ecc0p-1, + 0x1.e9131abf0b767p-1, 0x1.e573ac901e574p-1, 0x1.e1e1e1e1e1e1ep-1, + 0x1.de5d6e3f8868ap-1, 0x1.dae6076b981dbp-1, 0x1.d77b654b82c34p-1, + 0x1.d41d41d41d41dp-1, 0x1.d0cb58f6ec074p-1, 0x1.cd85689039b0bp-1, + 0x1.ca4b3055ee191p-1, 0x1.c71c71c71c71cp-1, 0x1.c3f8f01c3f8f0p-1, + 0x1.c0e070381c0e0p-1, 0x1.bdd2b899406f7p-1, 0x1.bacf914c1bad0p-1, + 0x1.b7d6c3dda338bp-1, 0x1.b4e81b4e81b4fp-1, 0x1.b2036406c80d9p-1, + 0x1.af286bca1af28p-1, 0x1.ac5701ac5701bp-1, 0x1.a98ef606a63bep-1, + 0x1.a6d01a6d01a6dp-1, 0x1.a41a41a41a41ap-1, 0x1.a16d3f97a4b02p-1, + 0x1.9ec8e951033d9p-1, 0x1.9c2d14ee4a102p-1, 0x1.999999999999ap-1, + 0x1.970e4f80cb872p-1, 0x1.948b0fcd6e9e0p-1, 0x1.920fb49d0e229p-1, + 0x1.8f9c18f9c18fap-1, 0x1.8d3018d3018d3p-1, 0x1.8acb90f6bf3aap-1, + 0x1.886e5f0abb04ap-1, 0x1.8618618618618p-1, 0x1.83c977ab2beddp-1, + 0x1.8181818181818p-1, 0x1.7f405fd017f40p-1, 0x1.7d05f417d05f4p-1, + 0x1.7ad2208e0ecc3p-1, 0x1.78a4c8178a4c8p-1, 0x1.767dce434a9b1p-1, + 0x1.745d1745d1746p-1, 0x1.724287f46debcp-1, 0x1.702e05c0b8170p-1, + 0x1.6e1f76b4337c7p-1, 0x1.6c16c16c16c17p-1, 0x1.6a13cd1537290p-1, + 0x1.6816816816817p-1, 0x1.661ec6a5122f9p-1, 0x1.642c8590b2164p-1, + 0x1.623fa77016240p-1, 0x1.6058160581606p-1, 0x1.5e75bb8d015e7p-1, + 0x1.5c9882b931057p-1, 0x1.5ac056b015ac0p-1, 0x1.58ed2308158edp-1, + 0x1.571ed3c506b3ap-1, 0x1.5555555555555p-1, 0x1.5390948f40febp-1, + 0x1.51d07eae2f815p-1, 0x1.5015015015015p-1, 0x1.4e5e0a72f0539p-1, + 0x1.4cab88725af6ep-1, 0x1.4afd6a052bf5bp-1, 0x1.49539e3b2d067p-1, + 0x1.47ae147ae147bp-1, 0x1.460cbc7f5cf9ap-1, 0x1.446f86562d9fbp-1, + 0x1.42d6625d51f87p-1, 0x1.4141414141414p-1, 0x1.3fb013fb013fbp-1, + 0x1.3e22cbce4a902p-1, 0x1.3c995a47babe7p-1, 0x1.3b13b13b13b14p-1, + 0x1.3991c2c187f63p-1, 0x1.3813813813814p-1, 0x1.3698df3de0748p-1, + 0x1.3521cfb2b78c1p-1, 0x1.33ae45b57bcb2p-1, 0x1.323e34a2b10bfp-1, + 0x1.30d190130d190p-1, 0x1.2f684bda12f68p-1, 0x1.2e025c04b8097p-1, + 0x1.2c9fb4d812ca0p-1, 0x1.2b404ad012b40p-1, 0x1.29e4129e4129ep-1, + 0x1.288b01288b013p-1, 0x1.27350b8812735p-1, 0x1.25e22708092f1p-1, + 0x1.2492492492492p-1, 0x1.23456789abcdfp-1, 0x1.21fb78121fb78p-1, + 0x1.20b470c67c0d9p-1, 0x1.1f7047dc11f70p-1, 0x1.1e2ef3b3fb874p-1, + 0x1.1cf06ada2811dp-1, 0x1.1bb4a4046ed29p-1, 
0x1.1a7b9611a7b96p-1, + 0x1.19453808ca29cp-1, 0x1.1811811811812p-1, 0x1.16e0689427379p-1, + 0x1.15b1e5f75270dp-1, 0x1.1485f0e0acd3bp-1, 0x1.135c81135c811p-1, + 0x1.12358e75d3033p-1, 0x1.1111111111111p-1, 0x1.0fef010fef011p-1, + 0x1.0ecf56be69c90p-1, 0x1.0db20a88f4696p-1, 0x1.0c9714fbcda3bp-1, + 0x1.0b7e6ec259dc8p-1, 0x1.0a6810a6810a7p-1, 0x1.0953f39010954p-1, + 0x1.0842108421084p-1, 0x1.073260a47f7c6p-1, 0x1.0624dd2f1a9fcp-1, + 0x1.05197f7d73404p-1, 0x1.0410410410410p-1, 0x1.03091b51f5e1ap-1, + 0x1.0204081020408p-1, 0x1.0101010101010p-1}; + +// Lookup table for log(f) = log(1 + n*2^(-7)) where n = 0..127. +static constexpr double LOG_F[128] = { + 0x0.0000000000000p+0, 0x1.fe02a6b106788p-8, 0x1.fc0a8b0fc03e3p-7, + 0x1.7b91b07d5b11ap-6, 0x1.f829b0e783300p-6, 0x1.39e87b9febd5fp-5, + 0x1.77458f632dcfcp-5, 0x1.b42dd711971bep-5, 0x1.f0a30c01162a6p-5, + 0x1.16536eea37ae0p-4, 0x1.341d7961bd1d0p-4, 0x1.51b073f06183fp-4, + 0x1.6f0d28ae56b4bp-4, 0x1.8c345d6319b20p-4, 0x1.a926d3a4ad563p-4, + 0x1.c5e548f5bc743p-4, 0x1.e27076e2af2e5p-4, 0x1.fec9131dbeabap-4, + 0x1.0d77e7cd08e59p-3, 0x1.1b72ad52f67a0p-3, 0x1.29552f81ff523p-3, + 0x1.371fc201e8f74p-3, 0x1.44d2b6ccb7d1ep-3, 0x1.526e5e3a1b437p-3, + 0x1.5ff3070a793d3p-3, 0x1.6d60fe719d21cp-3, 0x1.7ab890210d909p-3, + 0x1.87fa06520c910p-3, 0x1.9525a9cf456b4p-3, 0x1.a23bc1fe2b563p-3, + 0x1.af3c94e80bff2p-3, 0x1.bc286742d8cd6p-3, 0x1.c8ff7c79a9a21p-3, + 0x1.d5c216b4fbb91p-3, 0x1.e27076e2af2e5p-3, 0x1.ef0adcbdc5936p-3, + 0x1.fb9186d5e3e2ap-3, 0x1.0402594b4d040p-2, 0x1.0a324e27390e3p-2, + 0x1.1058bf9ae4ad5p-2, 0x1.1675cababa60ep-2, 0x1.1c898c16999fap-2, + 0x1.22941fbcf7965p-2, 0x1.2895a13de86a3p-2, 0x1.2e8e2bae11d30p-2, + 0x1.347dd9a987d54p-2, 0x1.3a64c556945e9p-2, 0x1.404308686a7e3p-2, + 0x1.4618bc21c5ec2p-2, 0x1.4be5f957778a0p-2, 0x1.51aad872df82dp-2, + 0x1.5767717455a6cp-2, 0x1.5d1bdbf5809cap-2, 0x1.62c82f2b9c795p-2, + 0x1.686c81e9b14aep-2, 0x1.6e08eaa2ba1e3p-2, 0x1.739d7f6bbd006p-2, + 0x1.792a55fdd47a2p-2, 0x1.7eaf83b82afc3p-2, 0x1.842d1da1e8b17p-2, + 0x1.89a3386c1425ap-2, 0x1.8f11e873662c7p-2, 0x1.947941c2116fap-2, + 0x1.99d958117e08ap-2, 0x1.9f323ecbf984bp-2, 0x1.a484090e5bb0ap-2, + 0x1.a9cec9a9a0849p-2, 0x1.af1293247786bp-2, 0x1.b44f77bcc8f62p-2, + 0x1.b9858969310fbp-2, 0x1.beb4d9da71b7bp-2, 0x1.c3dd7a7cdad4dp-2, + 0x1.c8ff7c79a9a21p-2, 0x1.ce1af0b85f3ebp-2, 0x1.d32fe7e00ebd5p-2, + 0x1.d83e7258a2f3ep-2, 0x1.dd46a04c1c4a0p-2, 0x1.e24881a7c6c26p-2, + 0x1.e744261d68787p-2, 0x1.ec399d2468cc0p-2, 0x1.f128f5faf06ecp-2, + 0x1.f6123fa7028acp-2, 0x1.faf588f78f31ep-2, 0x1.ffd2e0857f498p-2, + 0x1.02552a5a5d0fep-1, 0x1.04bdf9da926d2p-1, 0x1.0723e5c1cdf40p-1, + 0x1.0986f4f573520p-1, 0x1.0be72e4252a82p-1, 0x1.0e44985d1cc8bp-1, + 0x1.109f39e2d4c96p-1, 0x1.12f719593efbcp-1, 0x1.154c3d2f4d5e9p-1, + 0x1.179eabbd899a0p-1, 0x1.19ee6b467c96ep-1, 0x1.1c3b81f713c24p-1, + 0x1.1e85f5e7040d0p-1, 0x1.20cdcd192ab6dp-1, 0x1.23130d7bebf42p-1, + 0x1.2555bce98f7cbp-1, 0x1.2795e1289b11ap-1, 0x1.29d37fec2b08ap-1, + 0x1.2c0e9ed448e8bp-1, 0x1.2e47436e40268p-1, 0x1.307d7334f10bep-1, + 0x1.32b1339121d71p-1, 0x1.34e289d9ce1d3p-1, 0x1.37117b54747b5p-1, + 0x1.393e0d3562a19p-1, 0x1.3b68449fffc22p-1, 0x1.3d9026a7156fap-1, + 0x1.3fb5b84d16f42p-1, 0x1.41d8fe84672aep-1, 0x1.43f9fe2f9ce67p-1, + 0x1.4618bc21c5ec2p-1, 0x1.48353d1ea88dfp-1, 0x1.4a4f85db03ebbp-1, + 0x1.4c679afccee39p-1, 0x1.4e7d811b75bb0p-1, 0x1.50913cc01686bp-1, + 0x1.52a2d265bc5aap-1, 0x1.54b2467999497p-1, 0x1.56bf9d5b3f399p-1, + 0x1.58cadb5cd7989p-1, 0x1.5ad404c359f2cp-1, 0x1.5cdb1dc6c1764p-1, + 0x1.5ee02a9241675p-1, 
0x1.60e32f44788d8p-1}; + +} // namespace acoshf_internal + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ACOSH_FLOAT_CONSTANTS_H diff --git a/libc/src/__support/math/acoshf.h b/libc/src/__support/math/acoshf.h new file mode 100644 index 0000000000000..f18f169f49bb8 --- /dev/null +++ b/libc/src/__support/math/acoshf.h @@ -0,0 +1,86 @@ +//===-- Implementation header for acoshf ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ACOSHF_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_ACOSHF_H + +#include "acoshf_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +static constexpr float acoshf(float x) { + using namespace acoshf_internal; + using FPBits_t = typename fputil::FPBits; + FPBits_t xbits(x); + + if (LIBC_UNLIKELY(x <= 1.0f)) { + if (x == 1.0f) + return 0.0f; + // x < 1. + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + return FPBits_t::quiet_nan().get_val(); + } + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + uint32_t x_u = xbits.uintval(); + if (LIBC_UNLIKELY(x_u >= 0x4f8ffb03)) { + if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) + return x; + + // Helper functions to set results for exceptional cases. + auto round_result_slightly_down = [](float r) -> float { + volatile float tmp = r; + tmp = tmp - 0x1.0p-25f; + return tmp; + }; + auto round_result_slightly_up = [](float r) -> float { + volatile float tmp = r; + tmp = tmp + 0x1.0p-25f; + return tmp; + }; + + switch (x_u) { + case 0x4f8ffb03: // x = 0x1.1ff606p32f + return round_result_slightly_up(0x1.6fdd34p4f); + case 0x5c569e88: // x = 0x1.ad3d1p57f + return round_result_slightly_up(0x1.45c146p5f); + case 0x5e68984e: // x = 0x1.d1309cp61f + return round_result_slightly_up(0x1.5c9442p5f); + case 0x655890d3: // x = 0x1.b121a6p75f + return round_result_slightly_down(0x1.a9a3f2p5f); + case 0x6eb1a8ec: // x = 0x1.6351d8p94f + return round_result_slightly_down(0x1.08b512p6f); + case 0x7997f30a: // x = 0x1.2fe614p116f + return round_result_slightly_up(0x1.451436p6f); + } + } +#else + if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) + return x; +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + double x_d = static_cast(x); + // acosh(x) = log(x + sqrt(x^2 - 1)) + return static_cast(log_eval( + x_d + fputil::sqrt(fputil::multiply_add(x_d, x_d, -1.0)))); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ACOSHF_H diff --git a/libc/src/__support/math/acoshf16.h b/libc/src/__support/math/acoshf16.h new file mode 100644 index 0000000000000..15e7f6ae7e208 --- /dev/null +++ b/libc/src/__support/math/acoshf16.h @@ -0,0 +1,123 @@ +//===-- Implementation header for acoshf16 ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
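// Illustration, not part of the patch: the identity acoshf relies on above,
// acosh(x) = log(x + sqrt(x^2 - 1)) for x >= 1, checked against <cmath>.
#include <cmath>
#include <cstdio>

int main() {
  for (double x : {1.0, 1.5, 4.0, 1.0e6}) {
    double log_form = std::log(x + std::sqrt(std::fma(x, x, -1.0)));
    std::printf("x=%-8g acosh(x)=%.17g  log form=%.17g\n", x, std::acosh(x),
                log_form);
  }
  return 0;
}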
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ACOSHF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ACOSHF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "acoshf_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+static constexpr float16 acoshf16(float16 x) {
+
+  using namespace acoshf_internal;
+  constexpr size_t N_EXCEPTS = 2;
+  constexpr fputil::ExceptValues<float16, N_EXCEPTS> ACOSHF16_EXCEPTS{{
+      // (input, RZ output, RU offset, RD offset, RN offset)
+      // x = 0x1.6dcp+1, acoshf16(x) = 0x1.b6p+0 (RZ)
+      {0x41B7, 0x3ED8, 1, 0, 0},
+      // x = 0x1.39p+0, acoshf16(x) = 0x1.4f8p-1 (RZ)
+      {0x3CE4, 0x393E, 1, 0, 1},
+  }};
+
+  using FPBits = fputil::FPBits<float16>;
+  FPBits xbits(x);
+  uint16_t x_u = xbits.uintval();
+
+  // Check for NaN input first.
+  if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
+    if (xbits.is_signaling_nan()) {
+      fputil::raise_except_if_required(FE_INVALID);
+      return FPBits::quiet_nan().get_val();
+    }
+    if (xbits.is_neg()) {
+      fputil::set_errno_if_required(EDOM);
+      fputil::raise_except_if_required(FE_INVALID);
+      return FPBits::quiet_nan().get_val();
+    }
+    return x;
+  }
+
+  // Domain error for inputs less than 1.0.
+  if (LIBC_UNLIKELY(x <= 1.0f)) {
+    if (x == 1.0f)
+      return FPBits::zero().get_val();
+    fputil::set_errno_if_required(EDOM);
+    fputil::raise_except_if_required(FE_INVALID);
+    return FPBits::quiet_nan().get_val();
+  }
+
+  if (auto r = ACOSHF16_EXCEPTS.lookup(xbits.uintval());
+      LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+
+  float xf = x;
+  // High-precision polynomial approximation for inputs close to 1.0
+  // ([1, 1.25)).
+  //
+  // Brief derivation:
+  // 1. Expand acosh(1 + delta) using Taylor series around delta=0:
+  //      acosh(1 + delta) ≈ sqrt(2 * delta) * [1 - delta/12 + 3*delta^2/160
+  //                         - 5*delta^3/896 + 35*delta^4/18432 + ...]
+  // 2. Truncate the series to fit accurately for delta in [0, 0.25].
+  // 3. Polynomial coefficients (from Sollya) used here are:
+  //      P(delta) ≈ 1 - 0x1.555556p-4 * delta + 0x1.333334p-6 * delta^2
+  //                 - 0x1.6db6dcp-8 * delta^3 + 0x1.f1c71cp-10 * delta^4
+  // 4. The Sollya commands used to generate these coefficients were:
+  //      > display = hexadecimal;
+  //      > round(1/12, SG, RN);
+  //      > round(3/160, SG, RN);
+  //      > round(5/896, SG, RN);
+  //      > round(35/18432, SG, RN);
+  //    With hexadecimal display mode enabled, the outputs were:
+  //      0x1.555556p-4
+  //      0x1.333334p-6
+  //      0x1.6db6dcp-8
+  //      0x1.f1c71cp-10
+  // 5. The maximum absolute error, estimated using:
+  //      dirtyinfnorm(acosh(1 + x) - sqrt(2*x) * P(x), [0, 0.25])
+  //    is:
+  //      0x1.d84281p-22
+  if (LIBC_UNLIKELY(x_u < 0x3D00U)) {
+    float delta = xf - 1.0f;
+    float sqrt_2_delta = fputil::sqrt<float>(2.0 * delta);
+    float pe = fputil::polyeval(delta, 0x1p+0f, -0x1.555556p-4f, 0x1.333334p-6f,
+                                -0x1.6db6dcp-8f, 0x1.f1c71cp-10f);
+    float approx = sqrt_2_delta * pe;
+    return fputil::cast<float16>(approx);
+  }
+
+  // acosh(x) = log(x + sqrt(x^2 - 1))
+  float sqrt_term = fputil::sqrt<float>(fputil::multiply_add(xf, xf, -1.0f));
+  float result = static_cast<float>(log_eval(xf + sqrt_term));
+
+  return fputil::cast<float16>(result);
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ACOSHF16_H
diff --git a/libc/src/__support/math/acoshf_utils.h b/libc/src/__support/math/acoshf_utils.h
new file mode 100644
index 0000000000000..808c3dd41cfe4
--- /dev/null
+++ b/libc/src/__support/math/acoshf_utils.h
@@ -0,0 +1,60 @@
+//===-- Collection of utils for acoshf --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ACOSHF_UTILS_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ACOSHF_UTILS_H
+
+#include "acosh_float_constants.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/multiply_add.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace acoshf_internal {
+
+// x should be positive, normal finite value
+LIBC_INLINE static double log_eval(double x) {
+  // For x = 2^ex * (1 + mx)
+  //   log(x) = ex * log(2) + log(1 + mx)
+  using FPB = fputil::FPBits<double>;
+  FPB bs(x);
+
+  double ex = static_cast<double>(bs.get_exponent());
+
+  // p1 is the leading 7 bits of mx, i.e.
+  //   p1 * 2^(-7) <= m_x < (p1 + 1) * 2^(-7).
+  int p1 = static_cast<int>(bs.get_mantissa() >> (FPB::FRACTION_LEN - 7));
+
+  // Set bs to (1 + (mx - p1*2^(-7)))
+  bs.set_uintval(bs.uintval() & (FPB::FRACTION_MASK >> 7));
+  bs.set_biased_exponent(FPB::EXP_BIAS);
+  // dx = (mx - p1*2^(-7)) / (1 + p1*2^(-7)).
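// A standalone sketch of the same decomposition, written with plain <cmath>
// calls so it can be tried outside the libc tree; it is an illustration of
// the idea, not the patch's code. The ONE_OVER_F / LOG_F tables are replaced
// by a division and std::log1p, and the Sollya minimax polynomial by a short
// Taylor expansion.
#include <cmath>

double log_eval_sketch(double x) {
  int ex;
  double m = 2.0 * std::frexp(x, &ex); // x = m * 2^(ex - 1), m in [1, 2)
  --ex;
  int p1 = static_cast<int>((m - 1.0) * 128.0); // leading 7 fraction bits
  double f = 1.0 + p1 / 128.0;                  // the table node 1 + p1*2^-7
  double dx = (m - f) / f;                      // 0 <= dx < 2^-7
  double log1p_dx = dx - 0.5 * dx * dx + dx * dx * dx / 3.0; // stand-in poly
  return ex * 0x1.62e42fefa39efp-1 + std::log1p(p1 / 128.0) + log1p_dx;
}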
+ double dx = (bs.get_val() - 1.0) * ONE_OVER_F[p1]; + + // Minimax polynomial of log(1 + dx) generated by Sollya with: + // > P = fpminimax(log(1 + x)/x, 6, [|D...|], [0, 2^-7]); + const double COEFFS[6] = {-0x1.ffffffffffffcp-2, 0x1.5555555552ddep-2, + -0x1.ffffffefe562dp-3, 0x1.9999817d3a50fp-3, + -0x1.554317b3f67a5p-3, 0x1.1dc5c45e09c18p-3}; + double dx2 = dx * dx; + double c1 = fputil::multiply_add(dx, COEFFS[1], COEFFS[0]); + double c2 = fputil::multiply_add(dx, COEFFS[3], COEFFS[2]); + double c3 = fputil::multiply_add(dx, COEFFS[5], COEFFS[4]); + + double p = fputil::polyeval(dx2, dx, c1, c2, c3); + double result = + fputil::multiply_add(ex, /*log(2)*/ 0x1.62e42fefa39efp-1, LOG_F[p1] + p); + return result; +} + +} // namespace acoshf_internal + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ACOSHF_UTILS_H diff --git a/libc/src/math/generic/asin_utils.h b/libc/src/__support/math/asin_utils.h similarity index 96% rename from libc/src/math/generic/asin_utils.h rename to libc/src/__support/math/asin_utils.h index 44913d573de2c..e0c9096e2bb78 100644 --- a/libc/src/math/generic/asin_utils.h +++ b/libc/src/__support/math/asin_utils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_MATH_GENERIC_ASIN_UTILS_H -#define LLVM_LIBC_SRC_MATH_GENERIC_ASIN_UTILS_H +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ASIN_UTILS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_ASIN_UTILS_H #include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/double_double.h" @@ -16,19 +16,18 @@ #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/integer_literals.h" #include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE_DECL { -namespace { +namespace asin_internal { using DoubleDouble = fputil::DoubleDouble; using Float128 = fputil::DyadicFloat<128>; -constexpr DoubleDouble PI = {0x1.1a62633145c07p-53, 0x1.921fb54442d18p1}; +static constexpr DoubleDouble PI = {0x1.1a62633145c07p-53, 0x1.921fb54442d18p1}; -constexpr DoubleDouble PI_OVER_TWO = {0x1.1a62633145c07p-54, - 0x1.921fb54442d18p0}; +static constexpr DoubleDouble PI_OVER_TWO = {0x1.1a62633145c07p-54, + 0x1.921fb54442d18p0}; #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS @@ -39,14 +38,14 @@ constexpr DoubleDouble PI_OVER_TWO = {0x1.1a62633145c07p-54, // > dirtyinfnorm(asin(x)/x - P, [0, 0.5]); // 0x1.1a71ef0a0f26a9fb7ed7e41dee788b13d1770db3dp-52 -constexpr double ASIN_COEFFS[12] = { +static constexpr double ASIN_COEFFS[12] = { 0x1.0000000000000p0, 0x1.5555555556dcfp-3, 0x1.3333333082e11p-4, 0x1.6db6dd14099edp-5, 0x1.f1c69b35bf81fp-6, 0x1.6e97194225a67p-6, 0x1.1babddb82ce12p-6, 0x1.d55bd078600d6p-7, 0x1.33328959e63d6p-7, 0x1.2b5993bda1d9bp-6, -0x1.806aff270bf25p-7, 0x1.02614e5ed3936p-5, }; -LIBC_INLINE double asin_eval(double u) { +LIBC_INLINE static constexpr double asin_eval(double u) { double u2 = u * u; double c0 = fputil::multiply_add(u, ASIN_COEFFS[1], ASIN_COEFFS[0]); double c1 = fputil::multiply_add(u, ASIN_COEFFS[3], ASIN_COEFFS[2]); @@ -124,7 +123,7 @@ LIBC_INLINE double asin_eval(double u) { // > dirtyinfnorm(asin(x)/x - P, [-1/64, 1/64]); // 0x1.999075402cafp-83 -constexpr double ASIN_COEFFS[9][12] = { +static constexpr double ASIN_COEFFS[9][12] = { {1.0, 0.0, 0x1.5555555555555p-3, 0x1.5555555555555p-57, 0x1.3333333333333p-4, 0x1.6db6db6db6db7p-5, 0x1.f1c71c71c71c7p-6, 0x1.6e8ba2e8ba2e9p-6, 0x1.1c4ec4ec4ec4fp-6, 0x1.c99999999999ap-7, @@ -164,8 +163,8 @@ constexpr double 
ASIN_COEFFS[9][12] = { }; // We calculate the lower part of the approximation P(u). -LIBC_INLINE DoubleDouble asin_eval(const DoubleDouble &u, unsigned &idx, - double &err) { +LIBC_INLINE static DoubleDouble asin_eval(const DoubleDouble &u, unsigned &idx, + double &err) { using fputil::multiply_add; // k = round(u * 32). double k = fputil::nearest_integer(u.hi * 0x1.0p5); @@ -239,7 +238,7 @@ LIBC_INLINE DoubleDouble asin_eval(const DoubleDouble &u, unsigned &idx, // + (676039 x^24)/104857600 + (1300075 x^26)/226492416 + // + (5014575 x^28)/973078528 + (9694845 x^30)/2080374784. -constexpr Float128 ASIN_COEFFS_F128[17][16] = { +static constexpr Float128 ASIN_COEFFS_F128[17][16] = { { {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, {Sign::POS, -130, 0xaaaaaaaa'aaaaaaaa'aaaaaaaa'aaaaaaab_u128}, @@ -548,13 +547,14 @@ constexpr Float128 ASIN_COEFFS_F128[17][16] = { }, }; -constexpr Float128 PI_OVER_TWO_F128 = { +static constexpr Float128 PI_OVER_TWO_F128 = { Sign::POS, -127, 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128}; -constexpr Float128 PI_F128 = {Sign::POS, -126, - 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128}; +static constexpr Float128 PI_F128 = { + Sign::POS, -126, 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128}; -LIBC_INLINE Float128 asin_eval(const Float128 &u, unsigned idx) { +LIBC_INLINE static constexpr Float128 asin_eval(const Float128 &u, + unsigned idx) { return fputil::polyeval(u, ASIN_COEFFS_F128[idx][0], ASIN_COEFFS_F128[idx][1], ASIN_COEFFS_F128[idx][2], ASIN_COEFFS_F128[idx][3], ASIN_COEFFS_F128[idx][4], ASIN_COEFFS_F128[idx][5], @@ -567,8 +567,8 @@ LIBC_INLINE Float128 asin_eval(const Float128 &u, unsigned idx) { #endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} // anonymous namespace +} // namespace asin_internal } // namespace LIBC_NAMESPACE_DECL -#endif // LLVM_LIBC_SRC_MATH_GENERIC_ASIN_UTILS_H +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ASIN_UTILS_H diff --git a/libc/src/__support/math/erff.h b/libc/src/__support/math/erff.h new file mode 100644 index 0000000000000..e54ec77b9def7 --- /dev/null +++ b/libc/src/__support/math/erff.h @@ -0,0 +1,193 @@ +//===-- Implementation header for erff --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ERFF_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_ERFF_H + +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +static constexpr float erff(float x) { + + // Polynomials approximating erf(x)/x on ( k/8, (k + 1)/8 ) generated by + // Sollya with: > P = fpminimax(erf(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], + // [|D...|], + // [k/8, (k + 1)/8]); + // for k = 0..31. 
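// A sketch of how the 32-entry table defined just below gets used (assuming
// 0 < |x| < 4): pick the subinterval by truncating |x| * 8, then evaluate the
// even polynomial in x^2. The patch's code later does the same with a bit
// trick for the index and an Estrin split instead of this Horner loop.
double erff_table_sketch(float x, const double (&coeffs)[32][8]) {
  double xd = static_cast<double>(x);
  double xabs = xd < 0.0 ? -xd : xd;
  int idx = static_cast<int>(xabs * 8.0); // subinterval [idx/8, (idx+1)/8)
  double xsq = xd * xd;
  double p = coeffs[idx][7];
  for (int i = 6; i >= 0; --i)
    p = p * xsq + coeffs[idx][i];
  return xd * p; // erf(x) ~ x * P_idx(x^2)
}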
+ constexpr double COEFFS[32][8] = { + {0x1.20dd750429b6dp0, -0x1.812746b037753p-2, 0x1.ce2f219e8596ap-4, + -0x1.b82cdacb78fdap-6, 0x1.56479297dfda5p-8, -0x1.8b3ac5455ef02p-11, + -0x1.126fcac367e3bp-8, 0x1.2d0bdb3ba4984p-4}, + {0x1.20dd750429b6dp0, -0x1.812746b0379a8p-2, 0x1.ce2f21a03cf2ap-4, + -0x1.b82ce30de083ep-6, 0x1.565bcad3eb60fp-8, -0x1.c02c66f659256p-11, + 0x1.f92f673385229p-14, -0x1.def402648ae9p-17}, + {0x1.20dd750429b34p0, -0x1.812746b032dcep-2, 0x1.ce2f219d84aaep-4, + -0x1.b82ce22dcf139p-6, 0x1.565b9efcd4af1p-8, -0x1.c021f1af414bcp-11, + 0x1.f7c6d177eff82p-14, -0x1.c9e4410dcf865p-17}, + {0x1.20dd750426eabp0, -0x1.812746ae592c7p-2, 0x1.ce2f211525f14p-4, + -0x1.b82ccc125e63fp-6, 0x1.56596f261cfd3p-8, -0x1.bfde1ff8eeecfp-11, + 0x1.f31a9d15dc5d8p-14, -0x1.a5a4362844b3cp-17}, + {0x1.20dd75039c705p0, -0x1.812746777e74dp-2, 0x1.ce2f17af98a1bp-4, + -0x1.b82be4b817cbep-6, 0x1.564bec2e2962ep-8, -0x1.bee86f9da3558p-11, + 0x1.e9443689dc0ccp-14, -0x1.79c0f230805d8p-17}, + {0x1.20dd74f811211p0, -0x1.81274371a3e8fp-2, 0x1.ce2ec038262e5p-4, + -0x1.b8265b82c5e1fp-6, 0x1.5615a2e239267p-8, -0x1.bc63ae023dcebp-11, + 0x1.d87c2102f7e06p-14, -0x1.49584bea41d62p-17}, + {0x1.20dd746d063e3p0, -0x1.812729a8a950fp-2, 0x1.ce2cb0a2df232p-4, + -0x1.b80eca1f51278p-6, 0x1.5572e26c46815p-8, -0x1.b715e5638b65ep-11, + 0x1.bfbb195484968p-14, -0x1.177a565c15c52p-17}, + {0x1.20dd701b44486p0, -0x1.812691145f237p-2, 0x1.ce23a06b8cfd9p-4, + -0x1.b7c1dc7245288p-6, 0x1.53e92f7f397ddp-8, -0x1.ad97cc4acf0b2p-11, + 0x1.9f028b2b09b71p-14, -0x1.cdc4da08da8c1p-18}, + {0x1.20dd5715ac332p0, -0x1.8123e680bd0ebp-2, 0x1.ce0457aded691p-4, + -0x1.b6f52d52bed4p-6, 0x1.50c291b84414cp-8, -0x1.9ea246b1ad4a9p-11, + 0x1.77654674e0cap-14, -0x1.737c11a1bcebbp-18}, + {0x1.20dce6593e114p0, -0x1.811a59c02eadcp-2, 0x1.cdab53c7cd7d5p-4, + -0x1.b526d2e321eedp-6, 0x1.4b1d32cd8b994p-8, -0x1.8963143ec0a1ep-11, + 0x1.4ad5700e4db91p-14, -0x1.231e100e43ef2p-18}, + {0x1.20db48bfd5a62p0, -0x1.80fdd84f9e308p-2, 0x1.ccd340d462983p-4, + -0x1.b196a2928768p-6, 0x1.4210c2c13a0f7p-8, -0x1.6dbdfb4ff71aep-11, + 0x1.1bca2d17fbd71p-14, -0x1.bca36f90c7cf5p-19}, + {0x1.20d64b2f8f508p0, -0x1.80b4d4f19fa8bp-2, 0x1.cb088197262e3p-4, + -0x1.ab51fd02e5b99p-6, 0x1.34e1e5e81a632p-8, -0x1.4c66377b502cep-11, + 0x1.d9ad25066213cp-15, -0x1.4b0df7dd0cfa1p-19}, + {0x1.20c8fc1243576p0, -0x1.8010cb2009e27p-2, 0x1.c7a47e9299315p-4, + -0x1.a155be5683654p-6, 0x1.233502694997bp-8, -0x1.26c94b7d813p-11, + 0x1.8094f1de25fb9p-15, -0x1.e0e3d776c6eefp-20}, + {0x1.20a9bd1611bc1p0, -0x1.7ec7fbce83f9p-2, 0x1.c1d757d7317b7p-4, + -0x1.92c160cd589fp-6, 0x1.0d307269cc5c2p-8, -0x1.fda5b0d2d1879p-12, + 0x1.2fdd7b3b14a7fp-15, -0x1.54eed4a26af5ap-20}, + {0x1.20682834f943dp0, -0x1.7c73f747bf5a9p-2, 0x1.b8c2db4a9ffd1p-4, + -0x1.7f0e4ffe989ecp-6, 0x1.e7061eae4166ep-9, -0x1.ad36e873fff2dp-12, + 0x1.d39222396128ep-16, -0x1.d83dacec5ea6bp-21}, + {0x1.1feb8d12676d7p0, -0x1.7898347284afep-2, 0x1.aba3466b34451p-4, + -0x1.663adc573e2f9p-6, 0x1.ae99fb17c3e08p-9, -0x1.602f950ad5535p-12, + 0x1.5e9717490609dp-16, -0x1.3fca107bbc8d5p-21}, + {0x1.1f12fe3c536fap0, -0x1.72b1d1f22e6d3p-2, 0x1.99fc0eed4a896p-4, + -0x1.48db0a87bd8c6p-6, 0x1.73e368895aa61p-9, -0x1.19b35d5301fc8p-12, + 0x1.007987e4bb033p-16, -0x1.a7edcd4c2dc7p-22}, + {0x1.1db7b0df84d5dp0, -0x1.6a4e4a41cde02p-2, 0x1.83bbded16455dp-4, + -0x1.2809b3b36977ep-6, 0x1.39c08bab44679p-9, -0x1.b7b45a70ed119p-13, + 0x1.6e99b36410e7bp-17, -0x1.13619bb7ebc0cp-22}, + {0x1.1bb1c85c4a527p0, -0x1.5f23b99a249a3p-2, 0x1.694c91fa0d12cp-4, + -0x1.053e1ce11c72dp-6, 
0x1.02bf72c50ea78p-9, -0x1.4f478fb56cb02p-13, + 0x1.005f80ecbe213p-17, -0x1.5f2446bde7f5bp-23}, + {0x1.18dec3bd51f9dp0, -0x1.5123f58346186p-2, 0x1.4b8a1ca536ab4p-4, + -0x1.c4243015cc723p-7, 0x1.a1a8a01d351efp-10, -0x1.f466b34f1d86bp-14, + 0x1.5f835eea0bf6ap-18, -0x1.b83165b939234p-24}, + {0x1.152804c3369f4p0, -0x1.4084cd4afd4bcp-2, 0x1.2ba2e836e47aap-4, + -0x1.800f2dfc6904bp-7, 0x1.4a6daf0669c59p-10, -0x1.6e326ab872317p-14, + 0x1.d9761a6a755a5p-19, -0x1.0fca33f9dd4b5p-24}, + {0x1.1087ad68356aap0, -0x1.2dbb044707459p-2, 0x1.0aea8ceaa0384p-4, + -0x1.40b516d52b3d2p-7, 0x1.00c9e05f01d22p-10, -0x1.076afb0dc0ff7p-14, + 0x1.39fadec400657p-19, -0x1.4b5761352e7e3p-25}, + {0x1.0b0a7a8ba4a22p0, -0x1.196990d22d4a1p-2, 0x1.d5551e6ac0c4dp-5, + -0x1.07cce1770bd1ap-7, 0x1.890347b8848bfp-11, -0x1.757ec96750b6ap-15, + 0x1.9b258a1e06bcep-20, -0x1.8fc6d22da7572p-26}, + {0x1.04ce2be70fb47p0, -0x1.0449e4b0b9cacp-2, 0x1.97f7424f4b0e7p-5, + -0x1.ac825439c42f4p-8, 0x1.28f5f65426dfbp-11, -0x1.05b699a90f90fp-15, + 0x1.0a888eecf4593p-20, -0x1.deace2b32bb31p-27}, + {0x1.fbf9fb0e11cc8p-1, -0x1.de2640856545ap-3, 0x1.5f5b1f47f851p-5, + -0x1.588bc71eb41b9p-8, 0x1.bc6a0a772f56dp-12, -0x1.6b9fad1f1657ap-16, + 0x1.573204ba66504p-21, -0x1.1d38065c94e44p-27}, + {0x1.ed8f18c99e031p-1, -0x1.b4cb6acd903b4p-3, 0x1.2c7f3dddd6fc1p-5, + -0x1.13052067df4ep-8, 0x1.4a5027444082fp-12, -0x1.f672bab0e2554p-17, + 0x1.b83c756348cc9p-22, -0x1.534f1a1079499p-28}, + {0x1.debd33044166dp-1, -0x1.8d7cd9053f7d8p-3, 0x1.ff9957fb3d6e7p-6, + -0x1.b50be55de0f36p-9, 0x1.e92c8ec53a628p-13, -0x1.5a4b88d508007p-17, + 0x1.1a27737559e26p-22, -0x1.942ae62cb2c14p-29}, + {0x1.cfdbf0386f3bdp-1, -0x1.68e33d93b0dc4p-3, 0x1.b2683d58f53dep-6, + -0x1.5a9174e70d26fp-9, 0x1.69ddd326d49cdp-13, -0x1.dd8f397a8219cp-18, + 0x1.6a755016ad4ddp-23, -0x1.e366e0139187dp-30}, + {0x1.c132adb8d7464p-1, -0x1.475a899f61b46p-3, 0x1.70a431397a77cp-6, + -0x1.12e3d35beeee2p-9, 0x1.0c16b05738333p-13, -0x1.4a47f873e144ep-18, + 0x1.d3d494c698c02p-24, -0x1.2302c59547fe5p-30}, + {0x1.b2f5fd05555e7p-1, -0x1.28feefbe03ec7p-3, 0x1.3923acbb3a676p-6, + -0x1.b4ff793cd6358p-10, 0x1.8ea0eb8c913bcp-14, -0x1.cb31ec2baceb1p-19, + 0x1.30011e7e80c04p-24, -0x1.617710635cb1dp-31}, + {0x1.a54853cd9593ep-1, -0x1.0dbdbaea4dc8ep-3, 0x1.0a93e2c20a0fdp-6, + -0x1.5c969ff401ea8p-10, 0x1.29e0cc64fe627p-14, -0x1.4160d8e9d3c2ap-19, + 0x1.8e7b67594624ap-25, -0x1.b1cf2c975b09bp-32}, + {0x1.983ceece09ff8p-1, -0x1.eacc78f7a2dp-4, 0x1.c74418410655fp-7, + -0x1.1756a050e441ep-10, 0x1.bff3650f7f548p-15, -0x1.c56c0217d3adap-20, + 0x1.07b4918d0b489p-25, -0x1.0d4be8c1c50f8p-32}, + }; + + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint32_t x_u = xbits.uintval(); + uint32_t x_abs = x_u & 0x7fff'ffffU; + + if (LIBC_UNLIKELY(x_abs >= 0x4080'0000U)) { + constexpr float ONE[2] = {1.0f, -1.0f}; + constexpr float SMALL[2] = {-0x1.0p-25f, 0x1.0p-25f}; + + int sign = xbits.is_neg() ? 1 : 0; + + if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + return (x_abs > 0x7f80'0000) ? x : ONE[sign]; + } + + return ONE[sign] + SMALL[sign]; + } + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + // Exceptional mask = common 0 bits of 2 exceptional values. + constexpr uint32_t EXCEPT_MASK = 0x809a'6184U; + + if (LIBC_UNLIKELY((x_abs & EXCEPT_MASK) == 0)) { + // Exceptional values + if (LIBC_UNLIKELY(x_abs == 0x3f65'9229U)) // |x| = 0x1.cb2452p-1f + return x < 0.0f ? 
fputil::round_result_slightly_down(-0x1.972ea8p-1f) + : fputil::round_result_slightly_up(0x1.972ea8p-1f); + if (LIBC_UNLIKELY(x_abs == 0x4004'1e6aU)) // |x| = 0x1.083cd4p+1f + return x < 0.0f ? fputil::round_result_slightly_down(-0x1.fe3462p-1f) + : fputil::round_result_slightly_up(0x1.fe3462p-1f); + if (x_abs == 0U) + return x; + } +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + // Polynomial approximation: + // erf(x) ~ x * (c0 + c1 * x^2 + c2 * x^4 + ... + c7 * x^14) + double xd = static_cast(x); + double xsq = xd * xd; + + constexpr uint32_t EIGHT = 3 << FPBits::FRACTION_LEN; + int idx = static_cast(FPBits(x_abs + EIGHT).get_val()); + + double x4 = xsq * xsq; + double c0 = fputil::multiply_add(xsq, COEFFS[idx][1], COEFFS[idx][0]); + double c1 = fputil::multiply_add(xsq, COEFFS[idx][3], COEFFS[idx][2]); + double c2 = fputil::multiply_add(xsq, COEFFS[idx][5], COEFFS[idx][4]); + double c3 = fputil::multiply_add(xsq, COEFFS[idx][7], COEFFS[idx][6]); + + double x8 = x4 * x4; + double p0 = fputil::multiply_add(x4, c1, c0); + double p1 = fputil::multiply_add(x4, c3, c2); + + return static_cast(xd * fputil::multiply_add(x8, p1, p0)); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ERFF_H diff --git a/libc/src/__support/math/exp.h b/libc/src/__support/math/exp.h index a538df1e825dc..ff59ff79e3381 100644 --- a/libc/src/__support/math/exp.h +++ b/libc/src/__support/math/exp.h @@ -40,11 +40,11 @@ static constexpr double LOG2_E = 0x1.71547652b82fep+0; // Error bounds: // Errors when using double precision. -static constexpr double ERR_D = 0x1.8p-63; +static constexpr double EXP_ERR_D = 0x1.8p-63; #ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS // Errors when using double-double precision. -static constexpr double ERR_DD = 0x1.0p-99; +static constexpr double EXP_ERR_DD = 0x1.0p-99; #endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS // -2^-12 * log(2) @@ -387,7 +387,8 @@ static double exp(double x) { #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS if (LIBC_UNLIKELY(denorm)) { - return ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D) + return ziv_test_denorm(hi, exp_mid.hi, lo, + EXP_ERR_D) .value(); } else { // to multiply by 2^hi, a fast way is to simply add hi to the exponent @@ -399,12 +400,12 @@ static double exp(double x) { } #else if (LIBC_UNLIKELY(denorm)) { - if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); + if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, EXP_ERR_D); LIBC_LIKELY(r.has_value())) return r.value(); } else { - double upper = exp_mid.hi + (lo + ERR_D); - double lower = exp_mid.hi + (lo - ERR_D); + double upper = exp_mid.hi + (lo + EXP_ERR_D); + double lower = exp_mid.hi + (lo - EXP_ERR_D); if (LIBC_LIKELY(upper == lower)) { // to multiply by 2^hi, a fast way is to simply add hi to the exponent @@ -419,12 +420,12 @@ static double exp(double x) { DoubleDouble r_dd = exp_double_double(x, kd, exp_mid); if (LIBC_UNLIKELY(denorm)) { - if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); + if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, EXP_ERR_DD); LIBC_LIKELY(r.has_value())) return r.value(); } else { - double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); - double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); + double upper_dd = r_dd.hi + (r_dd.lo + EXP_ERR_DD); + double lower_dd = r_dd.hi + (r_dd.lo - EXP_ERR_DD); if (LIBC_LIKELY(upper_dd == lower_dd)) { int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; diff --git a/libc/src/__support/math/exp10.h b/libc/src/__support/math/exp10.h new file mode 100644 index 0000000000000..fa60e40c43e5d 
--- /dev/null +++ b/libc/src/__support/math/exp10.h @@ -0,0 +1,502 @@ +//===-- Implementation header for exp10 ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_H + +#include "exp_constants.h" // Lookup tables EXP2_MID1 and EXP_M2. +#include "exp_utils.h" // ziv_test_denorm. +#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/optional.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/FPUtil/triple_double.h" +#include "src/__support/common.h" +#include "src/__support/integer_literals.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +namespace LIBC_NAMESPACE_DECL { + +using fputil::DoubleDouble; +using fputil::TripleDouble; +using Float128 = typename fputil::DyadicFloat<128>; + +using LIBC_NAMESPACE::operator""_u128; + +// log2(10) +static constexpr double LOG2_10 = 0x1.a934f0979a371p+1; + +// -2^-12 * log10(2) +// > a = -2^-12 * log10(2); +// > b = round(a, 32, RN); +// > c = round(a - b, 32, RN); +// > d = round(a - b - c, D, RN); +// Errors < 1.5 * 2^-144 +static constexpr double MLOG10_2_EXP2_M12_HI = -0x1.3441350ap-14; +static constexpr double MLOG10_2_EXP2_M12_MID = 0x1.0c0219dc1da99p-51; + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +static constexpr double MLOG10_2_EXP2_M12_MID_32 = 0x1.0c0219dcp-51; +static constexpr double MLOG10_2_EXP2_M12_LO = 0x1.da994fd20dba2p-87; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// Error bounds: +// Errors when using double precision. +constexpr double EXP10_ERR_D = 0x1.8p-63; + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Errors when using double-double precision. +static constexpr double EXP10_ERR_DD = 0x1.8p-99; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// Polynomial approximations with double precision. Generated by Sollya with: +// > P = fpminimax((10^x - 1)/x, 3, [|D...|], [-2^-14, 2^-14]); +// > P; +// Error bounds: +// | output - (10^dx - 1) / dx | < 2^-52. +LIBC_INLINE static double exp10_poly_approx_d(double dx) { + // dx^2 + double dx2 = dx * dx; + double c0 = + fputil::multiply_add(dx, 0x1.53524c73cea6ap+1, 0x1.26bb1bbb55516p+1); + double c1 = + fputil::multiply_add(dx, 0x1.2bd75cc6afc65p+0, 0x1.0470587aa264cp+1); + double p = fputil::multiply_add(dx2, c1, c0); + return p; +} + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Polynomial approximation with double-double precision. Generated by Solya +// with: +// > P = fpminimax((10^x - 1)/x, 5, [|DD...|], [-2^-14, 2^-14]); +// Error bounds: +// | output - 10^(dx) | < 2^-101 +static constexpr DoubleDouble exp10_poly_approx_dd(const DoubleDouble &dx) { + // Taylor polynomial. 
+ constexpr DoubleDouble COEFFS[] = { + {0, 0x1p0}, + {-0x1.f48ad494e927bp-53, 0x1.26bb1bbb55516p1}, + {-0x1.e2bfab3191cd2p-53, 0x1.53524c73cea69p1}, + {0x1.80fb65ec3b503p-53, 0x1.0470591de2ca4p1}, + {0x1.338fc05e21e55p-54, 0x1.2bd7609fd98c4p0}, + {0x1.d4ea116818fbp-56, 0x1.1429ffd519865p-1}, + {-0x1.872a8ff352077p-57, 0x1.a7ed70847c8b3p-3}, + + }; + + DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], + COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); + return p; +} + +// Polynomial approximation with 128-bit precision: +// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7 +// For |dx| < 2^-14: +// | output - 10^dx | < 1.5 * 2^-124. +static constexpr Float128 exp10_poly_approx_f128(const Float128 &dx) { + constexpr Float128 COEFFS_128[]{ + {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 + {Sign::POS, -126, 0x935d8ddd'aaa8ac16'ea56d62b'82d30a2d_u128}, + {Sign::POS, -126, 0xa9a92639'e753443a'80a99ce7'5f4d5bdb_u128}, + {Sign::POS, -126, 0x82382c8e'f1652304'6a4f9d7d'bf6c9635_u128}, + {Sign::POS, -124, 0x12bd7609'fd98c44c'34578701'9216c7af_u128}, + {Sign::POS, -127, 0x450a7ff4'7535d889'cc41ed7e'0d27aee5_u128}, + {Sign::POS, -130, 0xd3f6b844'702d636b'8326bb91'a6e7601d_u128}, + {Sign::POS, -130, 0x45b937f0'd05bb1cd'fa7b46df'314112a9_u128}, + }; + + Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], + COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], + COEFFS_128[6], COEFFS_128[7]); + return p; +} + +// Compute 10^(x) using 128-bit precision. +// TODO(lntue): investigate triple-double precision implementation for this +// step. +static Float128 exp10_f128(double x, double kd, int idx1, int idx2) { + double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact + double t2 = kd * MLOG10_2_EXP2_M12_MID_32; // exact + double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-144 + + Float128 dx = fputil::quick_add( + Float128(t1), fputil::quick_add(Float128(t2), Float128(t3))); + + // TODO: Skip recalculating exp_mid1 and exp_mid2. + Float128 exp_mid1 = + fputil::quick_add(Float128(EXP2_MID1[idx1].hi), + fputil::quick_add(Float128(EXP2_MID1[idx1].mid), + Float128(EXP2_MID1[idx1].lo))); + + Float128 exp_mid2 = + fputil::quick_add(Float128(EXP2_MID2[idx2].hi), + fputil::quick_add(Float128(EXP2_MID2[idx2].mid), + Float128(EXP2_MID2[idx2].lo))); + + Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); + + Float128 p = exp10_poly_approx_f128(dx); + + Float128 r = fputil::quick_mul(exp_mid, p); + + r.exponent += static_cast(kd) >> 12; + + return r; +} + +// Compute 10^x with double-double precision. +static DoubleDouble exp10_double_double(double x, double kd, + const DoubleDouble &exp_mid) { + // Recalculate dx: + // dx = x - k * 2^-12 * log10(2) + double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact + double t2 = kd * MLOG10_2_EXP2_M12_MID_32; // exact + double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-140 + + DoubleDouble dx = fputil::exact_add(t1, t2); + dx.lo += t3; + + // Degree-6 polynomial approximation in double-double precision. + // | p - 10^x | < 2^-103. + DoubleDouble p = exp10_poly_approx_dd(dx); + + // Error bounds: 2^-102. + DoubleDouble r = fputil::quick_mult(exp_mid, p); + + return r; +} +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// When output is denormal. +static double exp10_denorm(double x) { + // Range reduction. 
+ double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21); + int k = static_cast(cpp::bit_cast(tmp) >> 19); + double kd = static_cast(k); + + uint32_t idx1 = (k >> 6) & 0x3f; + uint32_t idx2 = k & 0x3f; + + int hi = k >> 12; + + DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; + DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; + DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); + + // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14 + double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact + double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h); + + double mid_lo = dx * exp_mid.hi; + + // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. + double p = exp10_poly_approx_d(dx); + + double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + return ziv_test_denorm(hi, exp_mid.hi, lo, + EXP10_ERR_D) + .value(); +#else + if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, EXP10_ERR_D); + LIBC_LIKELY(r.has_value())) + return r.value(); + + // Use double-double + DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); + + if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, EXP10_ERR_DD); + LIBC_LIKELY(r.has_value())) + return r.value(); + + // Use 128-bit precision + Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); + + return static_cast(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +// Check for exceptional cases when: +// * log10(1 - 2^-54) < x < log10(1 + 2^-53) +// * x >= log10(2^1024) +// * x <= log10(2^-1022) +// * x is inf or nan +static constexpr double exp10_set_exceptional(double x) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + uint64_t x_abs = xbits.abs().uintval(); + + // |x| < log10(1 + 2^-53) + if (x_abs <= 0x3c8bcb7b1526e50e) { + // 10^(x) ~ 1 + x/2 + return fputil::multiply_add(x, 0.5, 1.0); + } + + // x <= log10(2^-1022) || x >= log10(2^1024) or inf/nan. + if (x_u >= 0xc0733a7146f72a42) { + // x <= log10(2^-1075) or -inf/nan + if (x_u > 0xc07439b746e36b52) { + // exp(-Inf) = 0 + if (xbits.is_inf()) + return 0.0; + + // exp(nan) = nan + if (xbits.is_nan()) + return x; + + if (fputil::quick_get_round() == FE_UPWARD) + return FPBits::min_subnormal().get_val(); + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_UNDERFLOW); + return 0.0; + } + + return exp10_denorm(x); + } + + // x >= log10(2^1024) or +inf/nan + // x is finite + if (x_u < 0x7ff0'0000'0000'0000ULL) { + int rounding = fputil::quick_get_round(); + if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) + return FPBits::max_normal().get_val(); + + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW); + } + // x is +inf or nan + return x + FPBits::inf().get_val(); +} + +namespace math { + +static constexpr double exp10(double x) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + + // x <= log10(2^-1022) or x >= log10(2^1024) or + // log10(1 - 2^-54) < x < log10(1 + 2^-53). 
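  // For reference, the bit patterns tested in the branch below decode to
  // (values approximate):
  //   0x3c8bcb7b1526e50e ~  4.8e-17  ~ log10(1 + 2^-53)
  //   0xbc7bcb7b1526e50e ~ -2.4e-17  ~ log10(1 - 2^-54)
  //   0x40734413509f79ff ~  308.2547 ~ log10(2^1024)  (overflow threshold)
  //   0xc0733a7146f72a42 ~ -307.6527 ~ log10(2^-1022) (underflow/denormal threshold)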
+ if (LIBC_UNLIKELY(x_u >= 0xc0733a7146f72a42 || + (x_u <= 0xbc7bcb7b1526e50e && x_u >= 0x40734413509f79ff) || + x_u < 0x3c8bcb7b1526e50e)) { + return exp10_set_exceptional(x); + } + + // Now log10(2^-1075) < x <= log10(1 - 2^-54) or + // log10(1 + 2^-53) < x < log10(2^1024) + + // Range reduction: + // Let x = log10(2) * (hi + mid1 + mid2) + lo + // in which: + // hi is an integer + // mid1 * 2^6 is an integer + // mid2 * 2^12 is an integer + // then: + // 10^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 10^(lo). + // With this formula: + // - multiplying by 2^hi is exact and cheap, simply by adding the exponent + // field. + // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. + // - 10^(lo) ~ 1 + a0*lo + a1 * lo^2 + ... + // + // We compute (hi + mid1 + mid2) together by perform the rounding on + // x * log2(10) * 2^12. + // Since |x| < |log10(2^-1075)| < 2^9, + // |x * 2^12| < 2^9 * 2^12 < 2^21, + // So we can fit the rounded result round(x * 2^12) in int32_t. + // Thus, the goal is to be able to use an additional addition and fixed width + // shift to get an int32_t representing round(x * 2^12). + // + // Assuming int32_t using 2-complement representation, since the mantissa part + // of a double precision is unsigned with the leading bit hidden, if we add an + // extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 2^23 to the product, the + // part that are < 2^e2 in resulted mantissa of (x*2^12*L2E + C) can be + // considered as a proper 2-complement representations of x*2^12. + // + // One small problem with this approach is that the sum (x*2^12 + C) in + // double precision is rounded to the least significant bit of the dorminant + // factor C. In order to minimize the rounding errors from this addition, we + // want to minimize e1. Another constraint that we want is that after + // shifting the mantissa so that the least significant bit of int32_t + // corresponds to the unit bit of (x*2^12*L2E), the sign is correct without + // any adjustment. So combining these 2 requirements, we can choose + // C = 2^33 + 2^32, so that the sign bit corresponds to 2^31 bit, and hence + // after right shifting the mantissa, the resulting int32_t has correct sign. + // With this choice of C, the number of mantissa bits we need to shift to the + // right is: 52 - 33 = 19. + // + // Moreover, since the integer right shifts are equivalent to rounding down, + // we can add an extra 0.5 so that it will become round-to-nearest, tie-to- + // +infinity. So in particular, we can compute: + // hmm = x * 2^12 + C, + // where C = 2^33 + 2^32 + 2^-1, then if + // k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19), + // the reduced argument: + // lo = x - log10(2) * 2^-12 * k is bounded by: + // |lo| = |x - log10(2) * 2^-12 * k| + // = log10(2) * 2^-12 * | x * log2(10) * 2^12 - k | + // <= log10(2) * 2^-12 * (2^-1 + 2^-19) + // < 1.5 * 2^-2 * (2^-13 + 2^-31) + // = 1.5 * (2^-15 * 2^-31) + // + // Finally, notice that k only uses the mantissa of x * 2^12, so the + // exponent 2^12 is not needed. So we can simply define + // C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and + // k = int32_t(lower 51 bits of double(x + C) >> 19). + + // Rounding errors <= 2^-31. 
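// The rounding trick described above, restated as a small helper so it can be
// read in isolation (an illustration; the patch inlines the same two lines
// right below). Assumes round-to-nearest and |x * log2(10)| < 2^9.
static int round_to_fixed_point_sketch(double x) {
  constexpr double BIAS = 0x1.8000'0000'4p21; // 2^21 + 2^20 + 2^-13
  double tmp = fputil::multiply_add(x, LOG2_10, BIAS);
  // tmp's exponent is pinned to 2^21, so mantissa bit 19 carries weight 2^-12;
  // shifting it down to bit 0 yields round(x * log2(10) * 2^12).
  return static_cast<int>(cpp::bit_cast<uint64_t>(tmp) >> 19);
}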
+ double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21); + int k = static_cast(cpp::bit_cast(tmp) >> 19); + double kd = static_cast(k); + + uint32_t idx1 = (k >> 6) & 0x3f; + uint32_t idx2 = k & 0x3f; + + int hi = k >> 12; + + DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; + DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; + DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); + + // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14 + double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact + double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h); + + // We use the degree-4 polynomial to approximate 10^(lo): + // 10^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 + // = 1 + lo * P(lo) + // So that the errors are bounded by: + // |P(lo) - (10^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58 + // Let P_ be an evaluation of P where all intermediate computations are in + // double precision. Using either Horner's or Estrin's schemes, the evaluated + // errors can be bounded by: + // |P_(lo) - P(lo)| < 2^-51 + // => |lo * P_(lo) - (2^lo - 1) | < 2^-65 + // => 2^(mid1 + mid2) * |lo * P_(lo) - expm1(lo)| < 2^-64. + // Since we approximate + // 2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo, + // We use the expression: + // (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~ + // ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo) + // with errors bounded by 2^-64. + + double mid_lo = dx * exp_mid.hi; + + // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. + double p = exp10_poly_approx_d(dx); + + double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; + double r = + cpp::bit_cast(exp_hi + cpp::bit_cast(exp_mid.hi + lo)); + return r; +#else + double upper = exp_mid.hi + (lo + EXP10_ERR_D); + double lower = exp_mid.hi + (lo - EXP10_ERR_D); + + if (LIBC_LIKELY(upper == lower)) { + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; + double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper)); + return r; + } + + // Exact outputs when x = 1, 2, ..., 22 + hard to round with x = 23. + // Quick check mask: 0x800f'ffffU = ~(bits of 1.0 | ... 
| bits of 23.0) + if (LIBC_UNLIKELY((x_u & 0x8000'ffff'ffff'ffffULL) == 0ULL)) { + switch (x_u) { + case 0x3ff0000000000000: // x = 1.0 + return 10.0; + case 0x4000000000000000: // x = 2.0 + return 100.0; + case 0x4008000000000000: // x = 3.0 + return 1'000.0; + case 0x4010000000000000: // x = 4.0 + return 10'000.0; + case 0x4014000000000000: // x = 5.0 + return 100'000.0; + case 0x4018000000000000: // x = 6.0 + return 1'000'000.0; + case 0x401c000000000000: // x = 7.0 + return 10'000'000.0; + case 0x4020000000000000: // x = 8.0 + return 100'000'000.0; + case 0x4022000000000000: // x = 9.0 + return 1'000'000'000.0; + case 0x4024000000000000: // x = 10.0 + return 10'000'000'000.0; + case 0x4026000000000000: // x = 11.0 + return 100'000'000'000.0; + case 0x4028000000000000: // x = 12.0 + return 1'000'000'000'000.0; + case 0x402a000000000000: // x = 13.0 + return 10'000'000'000'000.0; + case 0x402c000000000000: // x = 14.0 + return 100'000'000'000'000.0; + case 0x402e000000000000: // x = 15.0 + return 1'000'000'000'000'000.0; + case 0x4030000000000000: // x = 16.0 + return 10'000'000'000'000'000.0; + case 0x4031000000000000: // x = 17.0 + return 100'000'000'000'000'000.0; + case 0x4032000000000000: // x = 18.0 + return 1'000'000'000'000'000'000.0; + case 0x4033000000000000: // x = 19.0 + return 10'000'000'000'000'000'000.0; + case 0x4034000000000000: // x = 20.0 + return 100'000'000'000'000'000'000.0; + case 0x4035000000000000: // x = 21.0 + return 1'000'000'000'000'000'000'000.0; + case 0x4036000000000000: // x = 22.0 + return 10'000'000'000'000'000'000'000.0; + case 0x4037000000000000: // x = 23.0 + return 0x1.52d02c7e14af6p76 + x; + } + } + + // Use double-double + DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); + + double upper_dd = r_dd.hi + (r_dd.lo + EXP10_ERR_DD); + double lower_dd = r_dd.hi + (r_dd.lo - EXP10_ERR_DD); + + if (LIBC_LIKELY(upper_dd == lower_dd)) { + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; + double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); + return r; + } + + // Use 128-bit precision + Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); + + return static_cast(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_H diff --git a/libc/src/__support/math/exp10_float16_constants.h b/libc/src/__support/math/exp10_float16_constants.h new file mode 100644 index 0000000000000..f5928db740ee4 --- /dev/null +++ b/libc/src/__support/math/exp10_float16_constants.h @@ -0,0 +1,43 @@ +//===-- Constants for exp10f16 function -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H + +#include "include/llvm-libc-macros/float16-macros.h" +#include + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/CPP/array.h" + +namespace LIBC_NAMESPACE_DECL { + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > for i from 0 to 7 do printsingle(round(2^(i * 2^-3), SG, RN)); +static constexpr cpp::array EXP2_MID_BITS = { + 0x3f80'0000U, 0x3f8b'95c2U, 0x3f98'37f0U, 0x3fa5'fed7U, + 0x3fb5'04f3U, 0x3fc5'672aU, 0x3fd7'44fdU, 0x3fea'c0c7U, +}; + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > round(log2(10), SG, RN); +static constexpr float LOG2F_10 = 0x1.a934fp+1f; + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > round(log10(2), SG, RN); +static constexpr float LOG10F_2 = 0x1.344136p-2f; + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/__support/math/exp10f.h similarity index 91% rename from libc/src/math/generic/exp10f_impl.h rename to libc/src/__support/math/exp10f.h index 975fd01a0a25c..807b4f0d6c109 100644 --- a/libc/src/math/generic/exp10f_impl.h +++ b/libc/src/__support/math/exp10f.h @@ -1,4 +1,4 @@ -//===-- Single-precision 10^x function ------------------------------------===// +//===-- Implementation header for exp10f ------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,22 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H -#define LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_H -#include "explogxf.h" +#include "exp10f_utils.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY namespace LIBC_NAMESPACE_DECL { -namespace generic { +namespace math { -LIBC_INLINE float exp10f(float x) { +static constexpr float exp10f(float x) { using FPBits = typename fputil::FPBits; FPBits xbits(x); @@ -132,7 +131,7 @@ LIBC_INLINE float exp10f(float x) { return static_cast(multiply_add(p, lo2 * rr.mh, c0 * rr.mh)); } -} // namespace generic +} // namespace math } // namespace LIBC_NAMESPACE_DECL -#endif // LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_H diff --git a/libc/src/__support/math/exp10f16.h b/libc/src/__support/math/exp10f16.h new file mode 100644 index 0000000000000..0d8b125348844 --- /dev/null +++ b/libc/src/__support/math/exp10f16.h @@ -0,0 +1,141 @@ +//===-- Implementation header for exp10f16 ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "exp10f16_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT +static constexpr size_t N_EXP10F16_EXCEPTS = 5; +#else +static constexpr size_t N_EXP10F16_EXCEPTS = 8; +#endif + +static constexpr fputil::ExceptValues + EXP10F16_EXCEPTS = {{ + // x = 0x1.8f4p-2, exp10f16(x) = 0x1.3ap+1 (RZ) + {0x363dU, 0x40e8U, 1U, 0U, 1U}, + // x = 0x1.95cp-2, exp10f16(x) = 0x1.3ecp+1 (RZ) + {0x3657U, 0x40fbU, 1U, 0U, 0U}, + // x = -0x1.018p-4, exp10f16(x) = 0x1.bbp-1 (RZ) + {0xac06U, 0x3aecU, 1U, 0U, 0U}, + // x = -0x1.c28p+0, exp10f16(x) = 0x1.1ccp-6 (RZ) + {0xbf0aU, 0x2473U, 1U, 0U, 0U}, + // x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ) + {0xc387U, 0x09a5U, 1U, 0U, 0U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ) + {0x4030U, 0x57c1U, 1U, 0U, 1U}, + // x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ) + {0x406eU, 0x591fU, 1U, 0U, 1U}, + // x = 0x1.1b8p+2, exp10f16(x) = 0x1.a4p+14 (RZ) + {0x446eU, 0x7690U, 1U, 0U, 1U}, +#endif + }}; +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +static constexpr float16 exp10f16(float16 x) { + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| >= 5, or x is NaN. + if (LIBC_UNLIKELY(x_abs >= 0x4500U)) { + // exp10(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When x >= 5. + if (x_bits.is_pos()) { + // exp10(+inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + + // When x <= -8. + if (x_u >= 0xc800U) { + // exp10(-inf) = +0 + if (x_bits.is_inf()) + return FPBits::zero().get_val(); + + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); + + if (fputil::fenv_is_round_up()) + return FPBits::min_subnormal().get_val(); + return FPBits::zero().get_val(); + } + } + + // When x is 1, 2, 3, or 4. These are hard-to-round cases with exact results. 
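  // The mask below works because 0x3c00U | 0x4000U | 0x4200U | 0x4400U ==
  // 0x7e00U: the cheap test only requires every set bit of x_u to lie inside
  // 0x7e00U (64 candidate encodings), and the switch then keeps just the four
  // encodings of 1.0, 2.0, 3.0 and 4.0.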
+ if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) { + switch (x_u) { + case 0x3c00U: // x = 1.0f16 + return fputil::cast(10.0); + case 0x4000U: // x = 2.0f16 + return fputil::cast(100.0); + case 0x4200U: // x = 3.0f16 + return fputil::cast(1'000.0); + case 0x4400U: // x = 4.0f16 + return fputil::cast(10'000.0); + } + } + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + // 10^x = 2^((hi + mid) * log2(10)) * 10^lo + auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x); + return fputil::cast(exp2_hi_mid * exp10_lo); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H diff --git a/libc/src/__support/math/exp10f16_utils.h b/libc/src/__support/math/exp10f16_utils.h new file mode 100644 index 0000000000000..bffb81ba606bb --- /dev/null +++ b/libc/src/__support/math/exp10f16_utils.h @@ -0,0 +1,64 @@ +//===-- Common utils for exp10f16 -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_UTILS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_UTILS_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "exp10_float16_constants.h" +#include "expf16_utils.h" +#include "src/__support/FPUtil/FPBits.h" + +namespace LIBC_NAMESPACE_DECL { + +LIBC_INLINE static constexpr ExpRangeReduction +exp10_range_reduction(float16 x) { + // For -8 < x < 5, to compute 10^x, we perform the following range reduction: + // find hi, mid, lo, such that: + // x = (hi + mid) * log2(10) + lo, in which + // hi is an integer, + // mid * 2^3 is an integer, + // -2^(-4) <= lo < 2^(-4). + // In particular, + // hi + mid = round(x * 2^3) * 2^(-3). + // Then, + // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo + // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid + // by adding hi to the exponent field of 2^mid. 10^lo is computed using a + // degree-4 minimax polynomial generated by Sollya. 
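  // Worked example (approximate): for x = 1,
  //   k  = round(1 * log2(10) * 2^3) = round(26.575) = 27,
  //   hi = 27 >> 3 = 3,  mid = 27 & 7 = 3,
  //   2^(hi + mid/8) = 8 * 2^(3/8) ~ 10.375,
  //   lo = 1 - 27 * log10(2) / 8 ~ -0.0160,  10^lo ~ 0.964,
  // and 10.375 * 0.964 ~ 10.0, as expected for 10^1.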
+ + float xf = x; + float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f)); + int x_hi_mid = static_cast(kf); + unsigned x_hi = static_cast(x_hi_mid) >> 3; + unsigned x_mid = static_cast(x_hi_mid) & 0x7; + // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x + float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf); + + uint32_t exp2_hi_mid_bits = + EXP2_MID_BITS[x_mid] + + static_cast(x_hi << fputil::FPBits::FRACTION_LEN); + float exp2_hi_mid = fputil::FPBits(exp2_hi_mid_bits).get_val(); + // Degree-4 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]); + // > 1 + x * P; + float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f, + 0x1.04b434p+1f, 0x1.2bcf9ep+0f); + return {exp2_hi_mid, exp10_lo}; +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_UTILS_H diff --git a/libc/src/__support/math/exp10f_utils.h b/libc/src/__support/math/exp10f_utils.h new file mode 100644 index 0000000000000..c30def9d62db2 --- /dev/null +++ b/libc/src/__support/math/exp10f_utils.h @@ -0,0 +1,157 @@ +//===-- Common utils for exp10f ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_UTILS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_UTILS_H + +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +struct ExpBase { + // Base = e + static constexpr int MID_BITS = 5; + static constexpr int MID_MASK = (1 << MID_BITS) - 1; + // log2(e) * 2^5 + static constexpr double LOG2_B = 0x1.71547652b82fep+0 * (1 << MID_BITS); + // High and low parts of -log(2) * 2^(-5) + static constexpr double M_LOGB_2_HI = -0x1.62e42fefa0000p-1 / (1 << MID_BITS); + static constexpr double M_LOGB_2_LO = + -0x1.cf79abc9e3b3ap-40 / (1 << MID_BITS); + // Look up table for bit fields of 2^(i/32) for i = 0..31, generated by Sollya + // with: + // > for i from 0 to 31 do printdouble(round(2^(i/32), D, RN)); + static constexpr int64_t EXP_2_MID[1 << MID_BITS] = { + 0x3ff0000000000000, 0x3ff059b0d3158574, 0x3ff0b5586cf9890f, + 0x3ff11301d0125b51, 0x3ff172b83c7d517b, 0x3ff1d4873168b9aa, + 0x3ff2387a6e756238, 0x3ff29e9df51fdee1, 0x3ff306fe0a31b715, + 0x3ff371a7373aa9cb, 0x3ff3dea64c123422, 0x3ff44e086061892d, + 0x3ff4bfdad5362a27, 0x3ff5342b569d4f82, 0x3ff5ab07dd485429, + 0x3ff6247eb03a5585, 0x3ff6a09e667f3bcd, 0x3ff71f75e8ec5f74, + 0x3ff7a11473eb0187, 0x3ff82589994cce13, 0x3ff8ace5422aa0db, + 0x3ff93737b0cdc5e5, 0x3ff9c49182a3f090, 0x3ffa5503b23e255d, + 0x3ffae89f995ad3ad, 0x3ffb7f76f2fb5e47, 0x3ffc199bdd85529c, + 0x3ffcb720dcef9069, 0x3ffd5818dcfba487, 0x3ffdfc97337b9b5f, + 0x3ffea4afa2a490da, 0x3fff50765b6e4540, + }; + + // Approximating e^dx with degree-5 minimax polynomial generated by Sollya: + // > Q = fpminimax(expm1(x)/x, 4, [|1, D...|], [-log(2)/64, log(2)/64]); + // Then: + // e^dx ~ P(dx) = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[3] * dx^5. 
+ static constexpr double COEFFS[4] = { + 0x1.ffffffffe5bc8p-2, 0x1.555555555cd67p-3, 0x1.5555c2a9b48b4p-5, + 0x1.11112a0e34bdbp-7}; + + LIBC_INLINE static double powb_lo(double dx) { + using fputil::multiply_add; + double dx2 = dx * dx; + double c0 = 1.0 + dx; + // c1 = COEFFS[0] + COEFFS[1] * dx + double c1 = multiply_add(dx, ExpBase::COEFFS[1], ExpBase::COEFFS[0]); + // c2 = COEFFS[2] + COEFFS[3] * dx + double c2 = multiply_add(dx, ExpBase::COEFFS[3], ExpBase::COEFFS[2]); + // r = c4 + c5 * dx^4 + // = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[5] * dx^7 + return fputil::polyeval(dx2, c0, c1, c2); + } +}; + +struct Exp10Base : public ExpBase { + // log2(10) * 2^5 + static constexpr double LOG2_B = 0x1.a934f0979a371p1 * (1 << MID_BITS); + // High and low parts of -log10(2) * 2^(-5). + // Notice that since |x * log2(10)| < 150: + // |k| = |round(x * log2(10) * 2^5)| < 2^8 * 2^5 = 2^13 + // So when the FMA instructions are not available, in order for the product + // k * M_LOGB_2_HI + // to be exact, we only store the high part of log10(2) up to 38 bits + // (= 53 - 15) of precision. + // It is generated by Sollya with: + // > round(log10(2), 44, RN); + static constexpr double M_LOGB_2_HI = -0x1.34413509f8p-2 / (1 << MID_BITS); + // > round(log10(2) - 0x1.34413509f8p-2, D, RN); + static constexpr double M_LOGB_2_LO = 0x1.80433b83b532ap-44 / (1 << MID_BITS); + + // Approximating 10^dx with degree-5 minimax polynomial generated by Sollya: + // > Q = fpminimax((10^x - 1)/x, 4, [|D...|], [-log10(2)/2^6, log10(2)/2^6]); + // Then: + // 10^dx ~ P(dx) = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5. + static constexpr double COEFFS[5] = {0x1.26bb1bbb55515p1, 0x1.53524c73bd3eap1, + 0x1.0470591dff149p1, 0x1.2bd7c0a9fbc4dp0, + 0x1.1429e74a98f43p-1}; + + static double powb_lo(double dx) { + using fputil::multiply_add; + double dx2 = dx * dx; + // c0 = 1 + COEFFS[0] * dx + double c0 = multiply_add(dx, Exp10Base::COEFFS[0], 1.0); + // c1 = COEFFS[1] + COEFFS[2] * dx + double c1 = multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]); + // c2 = COEFFS[3] + COEFFS[4] * dx + double c2 = multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]); + // r = c0 + dx^2 * (c1 + c2 * dx^2) + // = c0 + c1 * dx^2 + c2 * dx^4 + // = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5. + return fputil::polyeval(dx2, c0, c1, c2); + } +}; + +// Output of range reduction for exp_b: (2^(mid + hi), lo) +// where: +// b^x = 2^(mid + hi) * b^lo +struct exp_b_reduc_t { + double mh; // 2^(mid + hi) + double lo; +}; + +// The function correctly calculates b^x value with at least float precision +// in a limited range. 
+// Range reduction: +// b^x = 2^(hi + mid) * b^lo +// where: +// x = (hi + mid) * log_b(2) + lo +// hi is an integer, +// 0 <= mid * 2^MID_BITS < 2^MID_BITS is an integer +// -2^(-MID_BITS - 1) <= lo * log2(b) <= 2^(-MID_BITS - 1) +// Base class needs to provide the following constants: +// - MID_BITS : number of bits after decimal points used for mid +// - MID_MASK : 2^MID_BITS - 1, mask to extract mid bits +// - LOG2_B : log2(b) * 2^MID_BITS for scaling +// - M_LOGB_2_HI : high part of -log_b(2) * 2^(-MID_BITS) +// - M_LOGB_2_LO : low part of -log_b(2) * 2^(-MID_BITS) +// - EXP_2_MID : look up table for bit fields of 2^mid +// Return: +// { 2^(hi + mid), lo } +template +LIBC_INLINE static constexpr exp_b_reduc_t exp_b_range_reduc(float x) { + double xd = static_cast(x); + // kd = round((hi + mid) * log2(b) * 2^MID_BITS) + double kd = fputil::nearest_integer(Base::LOG2_B * xd); + // k = round((hi + mid) * log2(b) * 2^MID_BITS) + int k = static_cast(kd); + // hi = floor(kd * 2^(-MID_BITS)) + // exp_hi = shift hi to the exponent field of double precision. + uint64_t exp_hi = static_cast(k >> Base::MID_BITS) + << fputil::FPBits::FRACTION_LEN; + // mh = 2^hi * 2^mid + // mh_bits = bit field of mh + uint64_t mh_bits = Base::EXP_2_MID[k & Base::MID_MASK] + exp_hi; + double mh = fputil::FPBits(mh_bits).get_val(); + // dx = lo = x - (hi + mid) * log(2) + double dx = fputil::multiply_add( + kd, Base::M_LOGB_2_LO, fputil::multiply_add(kd, Base::M_LOGB_2_HI, xd)); + return {mh, dx}; +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_UTILS_H diff --git a/libc/src/math/generic/inv_trigf_utils.cpp b/libc/src/__support/math/inv_trigf_utils.h similarity index 56% rename from libc/src/math/generic/inv_trigf_utils.cpp rename to libc/src/__support/math/inv_trigf_utils.h index f23028bb86b5c..4a8fbeca93e49 100644 --- a/libc/src/math/generic/inv_trigf_utils.cpp +++ b/libc/src/__support/math/inv_trigf_utils.h @@ -1,4 +1,4 @@ -//===-- Single-precision general exp/log functions ------------------------===// +//===-- Single-precision general inverse trigonometric functions ----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,22 @@ // //===----------------------------------------------------------------------===// -#include "inv_trigf_utils.h" +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_INV_TRIGF_UTILS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_INV_TRIGF_UTILS_H + +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/common.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { +namespace inv_trigf_utils_internal { + +// PI and PI / 2 +static constexpr double M_MATH_PI = 0x1.921fb54442d18p+1; +static constexpr double M_MATH_PI_2 = 0x1.921fb54442d18p+0; + // Polynomial approximation for 0 <= x <= 1: // atan(x) ~ atan((i/16) + (x - (i/16)) * Q(x - i/16) // = P(x - i/16) @@ -29,7 +40,7 @@ namespace LIBC_NAMESPACE_DECL { // Notice that degree-7 is good enough for atanf, but degree-8 helps reduce the // error bounds for atan2f's fast pass 16 times, and it does not affect the // performance of atanf much. 
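// Usage sketch (an illustration, not code from this patch): for 0 <= x <= 1,
// the node table ATAN_K_OVER_16 and the per-node polynomials evaluated by
// atan_eval, both defined below in this header, combine as follows.
static double atan_from_tables_sketch(double x) {
  unsigned i = static_cast<unsigned>(x * 16.0 + 0.5); // nearest node, 0..16
  double u = x - static_cast<double>(i) * 0x1.0p-4;   // |u| <= 1/32
  return ATAN_K_OVER_16[i] + u * atan_eval(u, i);     // ~ atan(x)
}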
-double ATAN_COEFFS[17][9] = { +static constexpr double ATAN_COEFFS[17][9] = { {0.0, 1.0, 0x1.3f8d76d26d61bp-47, -0x1.5555555574cd8p-2, 0x1.0dde5d06878eap-29, 0x1.99997738acc77p-3, 0x1.2c43eac9797cap-16, -0x1.25fb020007dbdp-3, 0x1.c1b6c31d7b0aep-7}, @@ -83,4 +94,91 @@ double ATAN_COEFFS[17][9] = { 0x1.555e31a1e15e9p-6, -0x1.245240d65e629p-7, -0x1.fa9ba66478903p-11}, }; +// Look-up table for atan(k/16) with k = 0..16. +static constexpr double ATAN_K_OVER_16[17] = { + 0.0, + 0x1.ff55bb72cfdeap-5, + 0x1.fd5ba9aac2f6ep-4, + 0x1.7b97b4bce5b02p-3, + 0x1.f5b75f92c80ddp-3, + 0x1.362773707ebccp-2, + 0x1.6f61941e4def1p-2, + 0x1.a64eec3cc23fdp-2, + 0x1.dac670561bb4fp-2, + 0x1.0657e94db30dp-1, + 0x1.1e00babdefeb4p-1, + 0x1.345f01cce37bbp-1, + 0x1.4978fa3269ee1p-1, + 0x1.5d58987169b18p-1, + 0x1.700a7c5784634p-1, + 0x1.819d0b7158a4dp-1, + 0x1.921fb54442d18p-1, +}; + +// For |x| <= 1/32 and 0 <= i <= 16, return Q(x) such that: +// Q(x) ~ (atan(x + i/16) - atan(i/16)) / x. +LIBC_INLINE static double atan_eval(double x, unsigned i) { + double x2 = x * x; + + double c0 = fputil::multiply_add(x, ATAN_COEFFS[i][2], ATAN_COEFFS[i][1]); + double c1 = fputil::multiply_add(x, ATAN_COEFFS[i][4], ATAN_COEFFS[i][3]); + double c2 = fputil::multiply_add(x, ATAN_COEFFS[i][6], ATAN_COEFFS[i][5]); + double c3 = fputil::multiply_add(x, ATAN_COEFFS[i][8], ATAN_COEFFS[i][7]); + + double x4 = x2 * x2; + double d1 = fputil::multiply_add(x2, c1, c0); + double d2 = fputil::multiply_add(x2, c3, c2); + double p = fputil::multiply_add(x4, d2, d1); + return p; +} + +// Evaluate atan without big lookup table. +// atan(n/d) - atan(k/16) = atan((n/d - k/16) / (1 + (n/d) * (k/16))) +// = atan((n - d * k/16)) / (d + n * k/16)) +// So we let q = (n - d * k/16) / (d + n * k/16), +// and approximate with Taylor polynomial: +// atan(q) ~ q - q^3/3 + q^5/5 - q^7/7 + q^9/9 +LIBC_INLINE static double atan_eval_no_table(double num, double den, + double k_over_16) { + double num_r = fputil::multiply_add(den, -k_over_16, num); + double den_r = fputil::multiply_add(num, k_over_16, den); + double q = num_r / den_r; + + constexpr double ATAN_TAYLOR[] = { + -0x1.5555555555555p-2, + 0x1.999999999999ap-3, + -0x1.2492492492492p-3, + 0x1.c71c71c71c71cp-4, + }; + double q2 = q * q; + double q3 = q2 * q; + double q4 = q2 * q2; + double c0 = fputil::multiply_add(q2, ATAN_TAYLOR[1], ATAN_TAYLOR[0]); + double c1 = fputil::multiply_add(q2, ATAN_TAYLOR[3], ATAN_TAYLOR[2]); + double d = fputil::multiply_add(q4, c1, c0); + return fputil::multiply_add(q3, d, q); +} + +// > Q = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20|], +// [|1, D...|], [0, 0.5]); +static constexpr double ASIN_COEFFS[10] = { + 0x1.5555555540fa1p-3, 0x1.333333512edc2p-4, 0x1.6db6cc1541b31p-5, + 0x1.f1caff324770ep-6, 0x1.6e43899f5f4f4p-6, 0x1.1f847cf652577p-6, + 0x1.9b60f47f87146p-7, 0x1.259e2634c494fp-6, -0x1.df946fa875ddp-8, + 0x1.02311ecf99c28p-5}; + +// Evaluate P(x^2) - 1, where P(x^2) ~ asin(x)/x +LIBC_INLINE static double asin_eval(double xsq) { + double x4 = xsq * xsq; + double r1 = fputil::polyeval(x4, ASIN_COEFFS[0], ASIN_COEFFS[2], + ASIN_COEFFS[4], ASIN_COEFFS[6], ASIN_COEFFS[8]); + double r2 = fputil::polyeval(x4, ASIN_COEFFS[1], ASIN_COEFFS[3], + ASIN_COEFFS[5], ASIN_COEFFS[7], ASIN_COEFFS[9]); + return fputil::multiply_add(xsq, r2, r1); +} + +} // namespace inv_trigf_utils_internal + } // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_INV_TRIGF_UTILS_H diff --git a/libc/src/__support/wchar/mbrtowc.cpp b/libc/src/__support/wchar/mbrtowc.cpp 
index 90ba934c42b69..0f730d6e6dbec 100644 --- a/libc/src/__support/wchar/mbrtowc.cpp +++ b/libc/src/__support/wchar/mbrtowc.cpp @@ -37,7 +37,8 @@ ErrorOr mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, } auto wc = char_conv.pop_utf32(); if (wc.has_value()) { - *pwc = wc.value(); + if (pwc != nullptr) + *pwc = wc.value(); // null terminator -> return 0 if (wc.value() == L'\0') return 0; diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h index 0635bc57bf3e2..869ebdfc8b390 100644 --- a/libc/src/__support/wchar/string_converter.h +++ b/libc/src/__support/wchar/string_converter.h @@ -56,6 +56,9 @@ template class StringConverter { // TODO: following functions are almost identical // look into templating CharacterConverter pop functions ErrorOr popUTF32() { + if (num_to_write == 0) + return Error(-1); + if (cr.isEmpty() || src_idx == 0) { auto src_elements_read = pushFullCharacter(); if (!src_elements_read.has_value()) @@ -79,6 +82,9 @@ template class StringConverter { } ErrorOr popUTF8() { + if (num_to_write == 0) + return Error(-1); + if (cr.isEmpty() || src_idx == 0) { auto src_elements_read = pushFullCharacter(); if (!src_elements_read.has_value()) diff --git a/libc/src/math/docs/add_math_function.md b/libc/src/math/docs/add_math_function.md index daaf1a3ec5639..d2c85ecf7d9b7 100644 --- a/libc/src/math/docs/add_math_function.md +++ b/libc/src/math/docs/add_math_function.md @@ -183,8 +183,8 @@ implementation (which is very often glibc). - Build and Run exhaustive test (might take hours to run): ``` - $ ninja libc.test.src.math.exhaustive._test - $ projects/libc/test/src/math/exhaustive/libc.test.src.math.exhaustive._test + $ ninja libc.test.src.math.exhaustive._test.__unit__ + $ projects/libc/test/src/math/exhaustive/libc.test.src.math.exhaustive._test.__unit__ ``` - Build and Run performance test: diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index b59beacd94143..408f99ef30760 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -358,7 +358,6 @@ add_entrypoint_object( libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fma - libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization ) @@ -448,7 +447,6 @@ add_entrypoint_object( libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fma - libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization @@ -1297,12 +1295,8 @@ add_entrypoint_object( HDRS ../erff.h DEPENDS - .common_constants - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.macros.optimization + libc.src.__support.math.erff + libc.src.errno.errno ) add_entrypoint_object( @@ -1457,35 +1451,7 @@ add_entrypoint_object( HDRS ../exp10.h DEPENDS - .common_constants - .explogxf - libc.src.__support.CPP.bit - libc.src.__support.CPP.optional - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.FPUtil.triple_double - 
libc.src.__support.integer_literals - libc.src.__support.macros.optimization - libc.src.errno.errno -) - -add_header_library( - exp10f_impl - HDRS - exp10f_impl.h - DEPENDS - .explogxf - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.optimization - libc.src.__support.common + libc.src.__support.math.exp10 libc.src.errno.errno ) @@ -1496,7 +1462,8 @@ add_entrypoint_object( HDRS ../exp10f.h DEPENDS - .exp10f_impl + libc.src.__support.math.exp10f + libc.src.errno.errno ) add_entrypoint_object( @@ -1506,20 +1473,8 @@ add_entrypoint_object( HDRS ../exp10f16.h DEPENDS - .expxf16 - libc.hdr.errno_macros - libc.hdr.fenv_macros - libc.src.__support.CPP.array - libc.src.__support.FPUtil.cast - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.optimization - libc.src.__support.macros.properties.cpu_features + libc.src.__support.math.exp10f16 + libc.src.errno.errno ) add_entrypoint_object( @@ -1548,7 +1503,6 @@ add_entrypoint_object( HDRS ../exp10m1f16.h DEPENDS - .expxf16 libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.__support.FPUtil.cast @@ -1560,6 +1514,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization libc.src.__support.macros.properties.cpu_features + libc.src.__support.math.exp10f16_utils ) add_entrypoint_object( @@ -1633,17 +1588,15 @@ add_entrypoint_object( ../powf.h DEPENDS .common_constants - .exp10f_impl .exp2f_impl .explogxf + libc.src.__support.math.exp10f libc.src.__support.CPP.bit - libc.src.__support.CPP.optional libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode libc.src.__support.FPUtil.sqrt libc.src.__support.FPUtil.triple_double libc.src.__support.macros.optimization @@ -1941,6 +1894,7 @@ add_object_library( common_constants.cpp DEPENDS libc.src.__support.math.exp_constants + libc.src.__support.math.acosh_float_constants libc.src.__support.number_pair ) @@ -3797,21 +3751,15 @@ add_entrypoint_object( ) #TODO: Add errno include to the hyperbolic functions. 
-add_object_library( +add_header_library( explogxf HDRS explogxf.h - SRCS - explogxf.cpp DEPENDS .common_constants - libc.src.__support.FPUtil.basic_operations - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.common libc.src.__support.math.exp_utils + libc.src.__support.math.acoshf_utils + libc.src.__support.macros.properties.cpu_features libc.src.errno.errno ) @@ -3920,12 +3868,7 @@ add_entrypoint_object( ../acoshf.h DEPENDS .explogxf - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.sqrt - libc.src.__support.macros.optimization + libc.src.__support.math.acoshf ) add_entrypoint_object( @@ -3935,18 +3878,8 @@ add_entrypoint_object( HDRS ../acoshf16.h DEPENDS - .explogxf - libc.hdr.errno_macros - libc.hdr.fenv_macros - libc.src.__support.FPUtil.cast - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.sqrt - libc.src.__support.macros.optimization - libc.src.__support.macros.properties.types + libc.src.__support.math.acoshf16 + libc.src.errno.errno ) add_entrypoint_object( @@ -3994,6 +3927,7 @@ add_entrypoint_object( DEPENDS .explogxf libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.fenv_impl libc.src.__support.macros.optimization ) @@ -4017,18 +3951,6 @@ add_entrypoint_object( libc.src.__support.macros.properties.types ) -add_object_library( - inv_trigf_utils - HDRS - inv_trigf_utils.h - SRCS - inv_trigf_utils.cpp - DEPENDS - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.common -) - add_entrypoint_object( asinf SRCS @@ -4042,7 +3964,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.sqrt libc.src.__support.macros.optimization - .inv_trigf_utils + libc.src.__support.math.inv_trigf_utils ) add_entrypoint_object( @@ -4064,20 +3986,6 @@ add_entrypoint_object( libc.src.__support.macros.properties.types ) -add_header_library( - asin_utils - HDRS - atan_utils.h - DEPENDS - libc.src.__support.integer_literals - libc.src.__support.FPUtil.double_double - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.macros.optimization -) - add_entrypoint_object( asin SRCS @@ -4085,7 +3993,7 @@ add_entrypoint_object( HDRS ../asin.h DEPENDS - .asin_utils + libc.src.__support.math.asin_utils libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl @@ -4104,13 +4012,7 @@ add_entrypoint_object( HDRS ../acosf.h DEPENDS - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.sqrt - libc.src.__support.macros.optimization - .inv_trigf_utils + libc.src.__support.math.acosf ) add_entrypoint_object( @@ -4120,17 +4022,8 @@ add_entrypoint_object( HDRS ../acosf16.h DEPENDS - libc.hdr.errno_macros - libc.hdr.fenv_macros - libc.src.__support.FPUtil.cast - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - 
libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.sqrt - libc.src.__support.macros.optimization - libc.src.__support.macros.properties.types + libc.src.__support.math.acosf16 + libc.src.errno.errno ) add_entrypoint_object( @@ -4140,17 +4033,7 @@ add_entrypoint_object( HDRS ../acos.h DEPENDS - .asin_utils - libc.src.__support.FPUtil.double_double - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.sqrt - libc.src.__support.macros.optimization - libc.src.__support.macros.properties.types - libc.src.__support.macros.properties.cpu_features + libc.src.__support.math.acos ) add_entrypoint_object( @@ -4192,7 +4075,6 @@ add_entrypoint_object( HDRS ../atanf.h DEPENDS - .inv_trigf_utils libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add @@ -4200,6 +4082,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization + libc.src.__support.math.inv_trigf_utils ) add_entrypoint_object( @@ -4248,7 +4131,6 @@ add_entrypoint_object( ../atan2f.h atan2f_float.h DEPENDS - .inv_trigf_utils libc.hdr.fenv_macros libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.fenv_impl @@ -4258,6 +4140,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization + libc.src.__support.math.inv_trigf_utils ) add_entrypoint_object( @@ -5059,10 +4942,11 @@ add_header_library( HDRS expxf16.h DEPENDS - libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer libc.src.__support.macros.attributes libc.src.__support.math.expf16_utils + libc.src.__support.math.exp10_float16_constants ) diff --git a/libc/src/math/generic/acos.cpp b/libc/src/math/generic/acos.cpp index c14721faef3ce..3a5964290cdd3 100644 --- a/libc/src/math/generic/acos.cpp +++ b/libc/src/math/generic/acos.cpp @@ -7,272 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/acos.h" -#include "asin_utils.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/dyadic_float.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/sqrt.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/__support/math/acos.h" namespace LIBC_NAMESPACE_DECL { -using DoubleDouble = fputil::DoubleDouble; -using Float128 = fputil::DyadicFloat<128>; - -LLVM_LIBC_FUNCTION(double, acos, (double x)) { - using FPBits = fputil::FPBits; - - FPBits xbits(x); - int x_exp = xbits.get_biased_exponent(); - - // |x| < 0.5. - if (x_exp < FPBits::EXP_BIAS - 1) { - // |x| < 2^-55. 
- if (LIBC_UNLIKELY(x_exp < FPBits::EXP_BIAS - 55)) { - // When |x| < 2^-55, acos(x) = pi/2 -#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) - return PI_OVER_TWO.hi; -#else - // Force the evaluation and prevent constant propagation so that it - // is rounded correctly for FE_UPWARD rounding mode. - return (xbits.abs().get_val() + 0x1.0p-160) + PI_OVER_TWO.hi; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - } - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - // acos(x) = pi/2 - asin(x) - // = pi/2 - x * P(x^2) - double p = asin_eval(x * x); - return PI_OVER_TWO.hi + fputil::multiply_add(-x, p, PI_OVER_TWO.lo); -#else - unsigned idx; - DoubleDouble x_sq = fputil::exact_mult(x, x); - double err = xbits.abs().get_val() * 0x1.0p-51; - // Polynomial approximation: - // p ~ asin(x)/x - DoubleDouble p = asin_eval(x_sq, idx, err); - // asin(x) ~ x * p - DoubleDouble r0 = fputil::exact_mult(x, p.hi); - // acos(x) = pi/2 - asin(x) - // ~ pi/2 - x * p - // = pi/2 - x * (p.hi + p.lo) - double r_hi = fputil::multiply_add(-x, p.hi, PI_OVER_TWO.hi); - // Use Dekker's 2SUM algorithm to compute the lower part. - double r_lo = ((PI_OVER_TWO.hi - r_hi) - r0.hi) - r0.lo; - r_lo = fputil::multiply_add(-x, p.lo, r_lo + PI_OVER_TWO.lo); - - // Ziv's accuracy test. - - double r_upper = r_hi + (r_lo + err); - double r_lower = r_hi + (r_lo - err); - - if (LIBC_LIKELY(r_upper == r_lower)) - return r_upper; - - // Ziv's accuracy test failed, perform 128-bit calculation. - - // Recalculate mod 1/64. - idx = static_cast(fputil::nearest_integer(x_sq.hi * 0x1.0p6)); - - // Get x^2 - idx/64 exactly. When FMA is available, double-double - // multiplication will be correct for all rounding modes. Otherwise we use - // Float128 directly. - Float128 x_f128(x); - -#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE - // u = x^2 - idx/64 - Float128 u_hi( - fputil::multiply_add(static_cast(idx), -0x1.0p-6, x_sq.hi)); - Float128 u = fputil::quick_add(u_hi, Float128(x_sq.lo)); -#else - Float128 x_sq_f128 = fputil::quick_mul(x_f128, x_f128); - Float128 u = fputil::quick_add( - x_sq_f128, Float128(static_cast(idx) * (-0x1.0p-6))); -#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE - - Float128 p_f128 = asin_eval(u, idx); - // Flip the sign of x_f128 to perform subtraction. - x_f128.sign = x_f128.sign.negate(); - Float128 r = - fputil::quick_add(PI_OVER_TWO_F128, fputil::quick_mul(x_f128, p_f128)); - - return static_cast(r); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - } - // |x| >= 0.5 - - double x_abs = xbits.abs().get_val(); - - // Maintaining the sign: - constexpr double SIGN[2] = {1.0, -1.0}; - double x_sign = SIGN[xbits.is_neg()]; - // |x| >= 1 - if (LIBC_UNLIKELY(x_exp >= FPBits::EXP_BIAS)) { - // x = +-1, asin(x) = +- pi/2 - if (x_abs == 1.0) { - // x = 1, acos(x) = 0, - // x = -1, acos(x) = pi - return x == 1.0 ? 0.0 : fputil::multiply_add(-x_sign, PI.hi, PI.lo); - } - // |x| > 1, return NaN. - if (xbits.is_quiet_nan()) - return x; - - // Set domain error for non-NaN input. 
- if (!xbits.is_nan()) - fputil::set_errno_if_required(EDOM); - - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - // When |x| >= 0.5, we perform range reduction as follow: - // - // When 0.5 <= x < 1, let: - // y = acos(x) - // We will use the double angle formula: - // cos(2y) = 1 - 2 sin^2(y) - // and the complement angle identity: - // x = cos(y) = 1 - 2 sin^2 (y/2) - // So: - // sin(y/2) = sqrt( (1 - x)/2 ) - // And hence: - // y/2 = asin( sqrt( (1 - x)/2 ) ) - // Equivalently: - // acos(x) = y = 2 * asin( sqrt( (1 - x)/2 ) ) - // Let u = (1 - x)/2, then: - // acos(x) = 2 * asin( sqrt(u) ) - // Moreover, since 0.5 <= x < 1: - // 0 < u <= 1/4, and 0 < sqrt(u) <= 0.5, - // And hence we can reuse the same polynomial approximation of asin(x) when - // |x| <= 0.5: - // acos(x) ~ 2 * sqrt(u) * P(u). - // - // When -1 < x <= -0.5, we reduce to the previous case using the formula: - // acos(x) = pi - acos(-x) - // = pi - 2 * asin ( sqrt( (1 + x)/2 ) ) - // ~ pi - 2 * sqrt(u) * P(u), - // where u = (1 - |x|)/2. - - // u = (1 - |x|)/2 - double u = fputil::multiply_add(x_abs, -0.5, 0.5); - // v_hi + v_lo ~ sqrt(u). - // Let: - // h = u - v_hi^2 = (sqrt(u) - v_hi) * (sqrt(u) + v_hi) - // Then: - // sqrt(u) = v_hi + h / (sqrt(u) + v_hi) - // ~ v_hi + h / (2 * v_hi) - // So we can use: - // v_lo = h / (2 * v_hi). - double v_hi = fputil::sqrt(u); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - constexpr DoubleDouble CONST_TERM[2] = {{0.0, 0.0}, PI}; - DoubleDouble const_term = CONST_TERM[xbits.is_neg()]; - - double p = asin_eval(u); - double scale = x_sign * 2.0 * v_hi; - double r = const_term.hi + fputil::multiply_add(scale, p, const_term.lo); - return r; -#else - -#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE - double h = fputil::multiply_add(v_hi, -v_hi, u); -#else - DoubleDouble v_hi_sq = fputil::exact_mult(v_hi, v_hi); - double h = (u - v_hi_sq.hi) - v_hi_sq.lo; -#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE - - // Scale v_lo and v_hi by 2 from the formula: - // vh = v_hi * 2 - // vl = 2*v_lo = h / v_hi. - double vh = v_hi * 2.0; - double vl = h / v_hi; - - // Polynomial approximation: - // p ~ asin(sqrt(u))/sqrt(u) - unsigned idx; - double err = vh * 0x1.0p-51; - - DoubleDouble p = asin_eval(DoubleDouble{0.0, u}, idx, err); - - // Perform computations in double-double arithmetic: - // asin(x) = pi/2 - (v_hi + v_lo) * (ASIN_COEFFS[idx][0] + p) - DoubleDouble r0 = fputil::quick_mult(DoubleDouble{vl, vh}, p); - - double r_hi, r_lo; - if (xbits.is_pos()) { - r_hi = r0.hi; - r_lo = r0.lo; - } else { - DoubleDouble r = fputil::exact_add(PI.hi, -r0.hi); - r_hi = r.hi; - r_lo = (PI.lo - r0.lo) + r.lo; - } - - // Ziv's accuracy test. - - double r_upper = r_hi + (r_lo + err); - double r_lower = r_hi + (r_lo - err); - - if (LIBC_LIKELY(r_upper == r_lower)) - return r_upper; - - // Ziv's accuracy test failed, we redo the computations in Float128. - // Recalculate mod 1/64. - idx = static_cast(fputil::nearest_integer(u * 0x1.0p6)); - - // After the first step of Newton-Raphson approximating v = sqrt(u), we have - // that: - // sqrt(u) = v_hi + h / (sqrt(u) + v_hi) - // v_lo = h / (2 * v_hi) - // With error: - // sqrt(u) - (v_hi + v_lo) = h * ( 1/(sqrt(u) + v_hi) - 1/(2*v_hi) ) - // = -h^2 / (2*v * (sqrt(u) + v)^2). 
- // Since: - // (sqrt(u) + v_hi)^2 ~ (2sqrt(u))^2 = 4u, - // we can add another correction term to (v_hi + v_lo) that is: - // v_ll = -h^2 / (2*v_hi * 4u) - // = -v_lo * (h / 4u) - // = -vl * (h / 8u), - // making the errors: - // sqrt(u) - (v_hi + v_lo + v_ll) = O(h^3) - // well beyond 128-bit precision needed. - - // Get the rounding error of vl = 2 * v_lo ~ h / vh - // Get full product of vh * vl -#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE - double vl_lo = fputil::multiply_add(-v_hi, vl, h) / v_hi; -#else - DoubleDouble vh_vl = fputil::exact_mult(v_hi, vl); - double vl_lo = ((h - vh_vl.hi) - vh_vl.lo) / v_hi; -#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE - // vll = 2*v_ll = -vl * (h / (4u)). - double t = h * (-0.25) / u; - double vll = fputil::multiply_add(vl, t, vl_lo); - // m_v = -(v_hi + v_lo + v_ll). - Float128 m_v = fputil::quick_add( - Float128(vh), fputil::quick_add(Float128(vl), Float128(vll))); - m_v.sign = xbits.sign(); - - // Perform computations in Float128: - // acos(x) = (v_hi + v_lo + vll) * P(u) , when 0.5 <= x < 1, - // = pi - (v_hi + v_lo + vll) * P(u) , when -1 < x <= -0.5. - Float128 y_f128(fputil::multiply_add(static_cast(idx), -0x1.0p-6, u)); - - Float128 p_f128 = asin_eval(y_f128, idx); - Float128 r_f128 = fputil::quick_mul(m_v, p_f128); - - if (xbits.is_neg()) - r_f128 = fputil::quick_add(PI_F128, r_f128); - - return static_cast(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} +LLVM_LIBC_FUNCTION(double, acos, (double x)) { return math::acos(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/acosf.cpp b/libc/src/math/generic/acosf.cpp index 8dd6de2ce7474..7afc7d661d552 100644 --- a/libc/src/math/generic/acosf.cpp +++ b/libc/src/math/generic/acosf.cpp @@ -7,127 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/acosf.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/except_value_utils.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/sqrt.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY - -#include "inv_trigf_utils.h" +#include "src/__support/math/acosf.h" namespace LIBC_NAMESPACE_DECL { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -static constexpr size_t N_EXCEPTS = 4; - -// Exceptional values when |x| <= 0.5 -static constexpr fputil::ExceptValues ACOSF_EXCEPTS = {{ - // (inputs, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.110b46p-26, acosf(x) = 0x1.921fb4p0 (RZ) - {0x328885a3, 0x3fc90fda, 1, 0, 1}, - // x = -0x1.110b46p-26, acosf(x) = 0x1.921fb4p0 (RZ) - {0xb28885a3, 0x3fc90fda, 1, 0, 1}, - // x = 0x1.04c444p-12, acosf(x) = 0x1.920f68p0 (RZ) - {0x39826222, 0x3fc907b4, 1, 0, 1}, - // x = -0x1.04c444p-12, acosf(x) = 0x1.923p0 (RZ) - {0xb9826222, 0x3fc91800, 1, 0, 1}, -}}; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -LLVM_LIBC_FUNCTION(float, acosf, (float x)) { - using FPBits = typename fputil::FPBits; - - FPBits xbits(x); - uint32_t x_uint = xbits.uintval(); - uint32_t x_abs = xbits.uintval() & 0x7fff'ffffU; - uint32_t x_sign = x_uint >> 31; - - // |x| <= 0.5 - if (LIBC_UNLIKELY(x_abs <= 0x3f00'0000U)) { - // |x| < 0x1p-10 - if (LIBC_UNLIKELY(x_abs < 0x3a80'0000U)) { - // When |x| < 2^-10, we use the following approximation: - // acos(x) = pi/2 - asin(x) - // ~ pi/2 - x - x^3 / 6 - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - // Check for exceptional values - if 
(auto r = ACOSF_EXCEPTS.lookup(x_uint); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - double xd = static_cast(x); - return static_cast(fputil::multiply_add( - -0x1.5555555555555p-3 * xd, xd * xd, M_MATH_PI_2 - xd)); - } - - // For |x| <= 0.5, we approximate acosf(x) by: - // acos(x) = pi/2 - asin(x) = pi/2 - x * P(x^2) - // Where P(X^2) = Q(X) is a degree-20 minimax even polynomial approximating - // asin(x)/x on [0, 0.5] generated by Sollya with: - // > Q = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20|], - // [|1, D...|], [0, 0.5]); - double xd = static_cast(x); - double xsq = xd * xd; - double x3 = xd * xsq; - double r = asin_eval(xsq); - return static_cast(fputil::multiply_add(-x3, r, M_MATH_PI_2 - xd)); - } - - // |x| >= 1, return 0, 2pi, or NaNs. - if (LIBC_UNLIKELY(x_abs >= 0x3f80'0000U)) { - if (x_abs == 0x3f80'0000U) - return x_sign ? /* x == -1.0f */ fputil::round_result_slightly_down( - 0x1.921fb6p+1f) - : /* x == 1.0f */ 0.0f; - - if (xbits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - // |x| <= +/-inf - if (x_abs <= 0x7f80'0000U) { - fputil::set_errno_if_required(EDOM); - fputil::raise_except_if_required(FE_INVALID); - } - - return x + FPBits::quiet_nan().get_val(); - } - - // When 0.5 < |x| < 1, we perform range reduction as follow: - // - // Assume further that 0.5 < x <= 1, and let: - // y = acos(x) - // We use the double angle formula: - // x = cos(y) = 1 - 2 sin^2(y/2) - // So: - // sin(y/2) = sqrt( (1 - x)/2 ) - // And hence: - // y = 2 * asin( sqrt( (1 - x)/2 ) ) - // Let u = (1 - x)/2, then - // acos(x) = 2 * asin( sqrt(u) ) - // Moreover, since 0.5 < x <= 1, - // 0 <= u < 1/4, and 0 <= sqrt(u) < 0.5, - // And hence we can reuse the same polynomial approximation of asin(x) when - // |x| <= 0.5: - // acos(x) ~ 2 * sqrt(u) * P(u). - // - // When -1 < x <= -0.5, we use the identity: - // acos(x) = pi - acos(-x) - // which is reduced to the postive case. - - xbits.set_sign(Sign::POS); - double xd = static_cast(xbits.get_val()); - double u = fputil::multiply_add(-0.5, xd, 0.5); - double cv = 2 * fputil::sqrt(u); - - double r3 = asin_eval(u); - double r = fputil::multiply_add(cv * u, r3, cv); - return static_cast(x_sign ? 
M_MATH_PI - r : r); -} +LLVM_LIBC_FUNCTION(float, acosf, (float x)) { return math::acosf(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/acosf16.cpp b/libc/src/math/generic/acosf16.cpp index 202a950fbb5dd..0bf85f84c842c 100644 --- a/libc/src/math/generic/acosf16.cpp +++ b/libc/src/math/generic/acosf16.cpp @@ -8,144 +8,10 @@ //===----------------------------------------------------------------------===// #include "src/math/acosf16.h" -#include "hdr/errno_macros.h" -#include "hdr/fenv_macros.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" -#include "src/__support/FPUtil/except_value_utils.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/sqrt.h" -#include "src/__support/macros/optimization.h" +#include "src/__support/math/acosf16.h" namespace LIBC_NAMESPACE_DECL { -// Generated by Sollya using the following command: -// > round(pi/2, SG, RN); -// > round(pi, SG, RN); -static constexpr float PI_OVER_2 = 0x1.921fb6p0f; -static constexpr float PI = 0x1.921fb6p1f; +LLVM_LIBC_FUNCTION(float16, acosf16, (float16 x)) { return math::acosf16(x); } -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -static constexpr size_t N_EXCEPTS = 2; - -static constexpr fputil::ExceptValues ACOSF16_EXCEPTS{{ - // (input, RZ output, RU offset, RD offset, RN offset) - {0xacaf, 0x3e93, 1, 0, 0}, - {0xb874, 0x4052, 1, 0, 1}, -}}; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -LLVM_LIBC_FUNCTION(float16, acosf16, (float16 x)) { - using FPBits = fputil::FPBits; - FPBits xbits(x); - - uint16_t x_u = xbits.uintval(); - uint16_t x_abs = x_u & 0x7fff; - uint16_t x_sign = x_u >> 15; - - // |x| > 0x1p0, |x| > 1, or x is NaN. - if (LIBC_UNLIKELY(x_abs > 0x3c00)) { - // acosf16(NaN) = NaN - if (xbits.is_nan()) { - if (xbits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - return x; - } - - // 1 < |x| <= +/-inf - fputil::raise_except_if_required(FE_INVALID); - fputil::set_errno_if_required(EDOM); - - return FPBits::quiet_nan().get_val(); - } - - float xf = x; - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - // Handle exceptional values - if (auto r = ACOSF16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - // |x| == 0x1p0, x is 1 or -1 - // if x is (-)1, return pi, else - // if x is (+)1, return 0 - if (LIBC_UNLIKELY(x_abs == 0x3c00)) - return fputil::cast(x_sign ? 
PI : 0.0f); - - float xsq = xf * xf; - - // |x| <= 0x1p-1, |x| <= 0.5 - if (x_abs <= 0x3800) { - // if x is 0, return pi/2 - if (LIBC_UNLIKELY(x_abs == 0)) - return fputil::cast(PI_OVER_2); - - // Note that: acos(x) = pi/2 + asin(-x) = pi/2 - asin(x) - // Degree-6 minimax polynomial of asin(x) generated by Sollya with: - // > P = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8|], [|SG...|], [0, 0.5]); - float interm = - fputil::polyeval(xsq, 0x1.000002p0f, 0x1.554c2ap-3f, 0x1.3541ccp-4f, - 0x1.43b2d6p-5f, 0x1.a0d73ep-5f); - return fputil::cast(fputil::multiply_add(-xf, interm, PI_OVER_2)); - } - - // When |x| > 0.5, assume that 0.5 < |x| <= 1 - // - // Step-by-step range-reduction proof: - // 1: Let y = asin(x), such that, x = sin(y) - // 2: From complimentary angle identity: - // x = sin(y) = cos(pi/2 - y) - // 3: Let z = pi/2 - y, such that x = cos(z) - // 4: From double angle formula; cos(2A) = 1 - 2 * sin^2(A): - // z = 2A, z/2 = A - // cos(z) = 1 - 2 * sin^2(z/2) - // 5: Make sin(z/2) subject of the formula: - // sin(z/2) = sqrt((1 - cos(z))/2) - // 6: Recall [3]; x = cos(z). Therefore: - // sin(z/2) = sqrt((1 - x)/2) - // 7: Let u = (1 - x)/2 - // 8: Therefore: - // asin(sqrt(u)) = z/2 - // 2 * asin(sqrt(u)) = z - // 9: Recall [3]; z = pi/2 - y. Therefore: - // y = pi/2 - z - // y = pi/2 - 2 * asin(sqrt(u)) - // 10: Recall [1], y = asin(x). Therefore: - // asin(x) = pi/2 - 2 * asin(sqrt(u)) - // 11: Recall that: acos(x) = pi/2 + asin(-x) = pi/2 - asin(x) - // Therefore: - // acos(x) = pi/2 - (pi/2 - 2 * asin(sqrt(u))) - // acos(x) = 2 * asin(sqrt(u)) - // - // THE RANGE REDUCTION, HOW? - // 12: Recall [7], u = (1 - x)/2 - // 13: Since 0.5 < x <= 1, therefore: - // 0 <= u <= 0.25 and 0 <= sqrt(u) <= 0.5 - // - // Hence, we can reuse the same [0, 0.5] domain polynomial approximation for - // Step [11] as `sqrt(u)` is in range. - // When -1 < x <= -0.5, the identity: - // acos(x) = pi - acos(-x) - // allows us to compute for the negative x value (lhs) - // with a positive x value instead (rhs). - - float xf_abs = (xf < 0 ? -xf : xf); - float u = fputil::multiply_add(-0.5f, xf_abs, 0.5f); - float sqrt_u = fputil::sqrt(u); - - // Degree-6 minimax polynomial of asin(x) generated by Sollya with: - // > P = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8|], [|SG...|], [0, 0.5]); - float asin_sqrt_u = - sqrt_u * fputil::polyeval(u, 0x1.000002p0f, 0x1.554c2ap-3f, - 0x1.3541ccp-4f, 0x1.43b2d6p-5f, 0x1.a0d73ep-5f); - - return fputil::cast( - x_sign ? fputil::multiply_add(-2.0f, asin_sqrt_u, PI) : 2 * asin_sqrt_u); -} } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/acoshf.cpp b/libc/src/math/generic/acoshf.cpp index c4927fa27a84b..5c04583650e62 100644 --- a/libc/src/math/generic/acoshf.cpp +++ b/libc/src/math/generic/acoshf.cpp @@ -7,73 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/math/acoshf.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/sqrt.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "src/math/generic/common_constants.h" -#include "src/math/generic/explogxf.h" -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { - using FPBits_t = typename fputil::FPBits; - FPBits_t xbits(x); - - if (LIBC_UNLIKELY(x <= 1.0f)) { - if (x == 1.0f) - return 0.0f; - // x < 1. 
- fputil::set_errno_if_required(EDOM); - fputil::raise_except_if_required(FE_INVALID); - return FPBits_t::quiet_nan().get_val(); - } +#include "src/__support/math/acoshf.h" -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - uint32_t x_u = xbits.uintval(); - if (LIBC_UNLIKELY(x_u >= 0x4f8ffb03)) { - if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) - return x; - - // Helper functions to set results for exceptional cases. - auto round_result_slightly_down = [](float r) -> float { - volatile float tmp = r; - tmp = tmp - 0x1.0p-25f; - return tmp; - }; - auto round_result_slightly_up = [](float r) -> float { - volatile float tmp = r; - tmp = tmp + 0x1.0p-25f; - return tmp; - }; - - switch (x_u) { - case 0x4f8ffb03: // x = 0x1.1ff606p32f - return round_result_slightly_up(0x1.6fdd34p4f); - case 0x5c569e88: // x = 0x1.ad3d1p57f - return round_result_slightly_up(0x1.45c146p5f); - case 0x5e68984e: // x = 0x1.d1309cp61f - return round_result_slightly_up(0x1.5c9442p5f); - case 0x655890d3: // x = 0x1.b121a6p75f - return round_result_slightly_down(0x1.a9a3f2p5f); - case 0x6eb1a8ec: // x = 0x1.6351d8p94f - return round_result_slightly_down(0x1.08b512p6f); - case 0x7997f30a: // x = 0x1.2fe614p116f - return round_result_slightly_up(0x1.451436p6f); - } - } -#else - if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) - return x; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS +namespace LIBC_NAMESPACE_DECL { - double x_d = static_cast(x); - // acosh(x) = log(x + sqrt(x^2 - 1)) - return static_cast(log_eval( - x_d + fputil::sqrt(fputil::multiply_add(x_d, x_d, -1.0)))); -} +LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { return math::acoshf(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/acoshf16.cpp b/libc/src/math/generic/acoshf16.cpp index 44783a8749ac2..bb3a91f707080 100644 --- a/libc/src/math/generic/acoshf16.cpp +++ b/libc/src/math/generic/acoshf16.cpp @@ -7,104 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/acoshf16.h" -#include "explogxf.h" -#include "hdr/errno_macros.h" -#include "hdr/fenv_macros.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" -#include "src/__support/FPUtil/except_value_utils.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/sqrt.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" +#include "src/__support/math/acoshf16.h" namespace LIBC_NAMESPACE_DECL { -static constexpr size_t N_EXCEPTS = 2; -static constexpr fputil::ExceptValues ACOSHF16_EXCEPTS{{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.6dcp+1, acoshf16(x) = 0x1.b6p+0 (RZ) - {0x41B7, 0x3ED8, 1, 0, 0}, - // x = 0x1.39p+0, acoshf16(x) = 0x1.4f8p-1 (RZ) - {0x3CE4, 0x393E, 1, 0, 1}, -}}; - -LLVM_LIBC_FUNCTION(float16, acoshf16, (float16 x)) { - using FPBits = fputil::FPBits; - FPBits xbits(x); - uint16_t x_u = xbits.uintval(); - - // Check for NaN input first. - if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) { - if (xbits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - if (xbits.is_neg()) { - fputil::set_errno_if_required(EDOM); - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - return x; - } - - // Domain error for inputs less than 1.0. 
- if (LIBC_UNLIKELY(x <= 1.0f)) { - if (x == 1.0f) - return FPBits::zero().get_val(); - fputil::set_errno_if_required(EDOM); - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - if (auto r = ACOSHF16_EXCEPTS.lookup(xbits.uintval()); - LIBC_UNLIKELY(r.has_value())) - return r.value(); - - float xf = x; - // High-precision polynomial approximation for inputs close to 1.0 - // ([1, 1.25)). - // - // Brief derivation: - // 1. Expand acosh(1 + delta) using Taylor series around delta=0: - // acosh(1 + delta) ≈ sqrt(2 * delta) * [1 - delta/12 + 3*delta^2/160 - // - 5*delta^3/896 + 35*delta^4/18432 + ...] - // 2. Truncate the series to fit accurately for delta in [0, 0.25]. - // 3. Polynomial coefficients (from sollya) used here are: - // P(delta) ≈ 1 - 0x1.555556p-4 * delta + 0x1.333334p-6 * delta^2 - // - 0x1.6db6dcp-8 * delta^3 + 0x1.f1c71cp-10 * delta^4 - // 4. The Sollya commands used to generate these coefficients were: - // > display = hexadecimal; - // > round(1/12, SG, RN); - // > round(3/160, SG, RN); - // > round(5/896, SG, RN); - // > round(35/18432, SG, RN); - // With hexadecimal display mode enabled, the outputs were: - // 0x1.555556p-4 - // 0x1.333334p-6 - // 0x1.6db6dcp-8 - // 0x1.f1c71cp-10 - // 5. The maximum absolute error, estimated using: - // dirtyinfnorm(acosh(1 + x) - sqrt(2*x) * P(x), [0, 0.25]) - // is: - // 0x1.d84281p-22 - if (LIBC_UNLIKELY(x_u < 0x3D00U)) { - float delta = xf - 1.0f; - float sqrt_2_delta = fputil::sqrt(2.0 * delta); - float pe = fputil::polyeval(delta, 0x1p+0f, -0x1.555556p-4f, 0x1.333334p-6f, - -0x1.6db6dcp-8f, 0x1.f1c71cp-10f); - float approx = sqrt_2_delta * pe; - return fputil::cast(approx); - } - - // acosh(x) = log(x + sqrt(x^2 - 1)) - float sqrt_term = fputil::sqrt(fputil::multiply_add(xf, xf, -1.0f)); - float result = static_cast(log_eval(xf + sqrt_term)); - - return fputil::cast(result); -} +LLVM_LIBC_FUNCTION(float16, acoshf16, (float16 x)) { return math::acoshf16(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/asin.cpp b/libc/src/math/generic/asin.cpp index ad77683d1f880..d286fceaab6ac 100644 --- a/libc/src/math/generic/asin.cpp +++ b/libc/src/math/generic/asin.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/asin.h" -#include "asin_utils.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -18,6 +17,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/__support/math/asin_utils.h" namespace LIBC_NAMESPACE_DECL { @@ -25,6 +25,7 @@ using DoubleDouble = fputil::DoubleDouble; using Float128 = fputil::DyadicFloat<128>; LLVM_LIBC_FUNCTION(double, asin, (double x)) { + using namespace asin_internal; using FPBits = fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp index 12383bf6dacae..77d6de910962c 100644 --- a/libc/src/math/generic/asinf.cpp +++ b/libc/src/math/generic/asinf.cpp @@ -17,7 +17,7 @@ #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA -#include "inv_trigf_utils.h" +#include "src/__support/math/inv_trigf_utils.h" namespace LIBC_NAMESPACE_DECL { @@ -44,6 +44,7 @@ static constexpr fputil::ExceptValues 
ASINF_EXCEPTS_HI = {{ #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS LLVM_LIBC_FUNCTION(float, asinf, (float x)) { + using namespace inv_trigf_utils_internal; using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/asinhf.cpp b/libc/src/math/generic/asinhf.cpp index 0bb7065eb1cfe..3aed3bc2c9cde 100644 --- a/libc/src/math/generic/asinhf.cpp +++ b/libc/src/math/generic/asinhf.cpp @@ -19,6 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { + using namespace acoshf_internal; using FPBits_t = typename fputil::FPBits; FPBits_t xbits(x); uint32_t x_u = xbits.uintval(); diff --git a/libc/src/math/generic/asinhf16.cpp b/libc/src/math/generic/asinhf16.cpp index 78786320b5f71..0a0b471d87ecc 100644 --- a/libc/src/math/generic/asinhf16.cpp +++ b/libc/src/math/generic/asinhf16.cpp @@ -49,6 +49,7 @@ static constexpr fputil::ExceptValues ASINHF16_EXCEPTS{{ #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS LLVM_LIBC_FUNCTION(float16, asinhf16, (float16 x)) { + using namespace acoshf_internal; using FPBits = fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp index c04b0eb1cc589..32b977f45d7e7 100644 --- a/libc/src/math/generic/atan2f.cpp +++ b/libc/src/math/generic/atan2f.cpp @@ -8,7 +8,6 @@ #include "src/math/atan2f.h" #include "hdr/fenv_macros.h" -#include "inv_trigf_utils.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -18,6 +17,7 @@ #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/inv_trigf_utils.h" #if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) && \ defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT) @@ -236,6 +236,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx, // which is about rounding errors of double-double (2^-104). 
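The 2^-104 figure in the comment above refers to double-double arithmetic: a value is carried as an unevaluated sum hi + lo of two doubles. A standalone sketch, not this patch's code (exact_add here is only a stand-in for the fputil helper of the same idea), of the error-free addition that underlies it:

#include <cstdio>

struct DoubleDouble {
  double hi, lo;
};

// Knuth's TwoSum: hi + lo is exactly a + b, so no information is lost.
static DoubleDouble exact_add(double a, double b) {
  double s = a + b;
  double bp = s - a;
  double err = (a - (s - bp)) + (b - bp);
  return {s, err};
}

int main() {
  // 2^-60 is far below the last bit of 1.0 (2^-52); lo keeps what hi cannot.
  DoubleDouble r = exact_add(1.0, 0x1.0p-60);
  std::printf("hi = %a, lo = %a\n", r.hi, r.lo);
  return 0;
}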
LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) { + using namespace inv_trigf_utils_internal; using FPBits = typename fputil::FPBits; constexpr double IS_NEG[2] = {1.0, -1.0}; constexpr double PI = 0x1.921fb54442d18p1; diff --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp index 46196dbe4162c..22f962ef4cce4 100644 --- a/libc/src/math/generic/atanf.cpp +++ b/libc/src/math/generic/atanf.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/atanf.h" -#include "inv_trigf_utils.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/except_value_utils.h" @@ -16,10 +15,12 @@ #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/inv_trigf_utils.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, atanf, (float x)) { + using namespace inv_trigf_utils_internal; using FPBits = typename fputil::FPBits; constexpr double FINAL_SIGN[2] = {1.0, -1.0}; diff --git a/libc/src/math/generic/atanhf.cpp b/libc/src/math/generic/atanhf.cpp index 2149314d2f676..602a8f042f783 100644 --- a/libc/src/math/generic/atanhf.cpp +++ b/libc/src/math/generic/atanhf.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/math/atanhf.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY @@ -15,6 +16,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { + using namespace acoshf_internal; using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/common_constants.cpp b/libc/src/math/generic/common_constants.cpp index 4dcf84d00ad50..42e3ff0deb348 100644 --- a/libc/src/math/generic/common_constants.cpp +++ b/libc/src/math/generic/common_constants.cpp @@ -51,52 +51,6 @@ const float ONE_OVER_F_FLOAT[128] = { 0x1.08421p-1f, 0x1.07326p-1f, 0x1.0624dep-1f, 0x1.05198p-1f, 0x1.041042p-1f, 0x1.03091cp-1f, 0x1.020408p-1f, 0x1.010102p-1f}; -// Lookup table for (1/f) where f = 1 + n*2^(-7), n = 0..127. 
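The deleted table below (now provided through libc.src.__support.math.acosh_float_constants) stores 1/(1 + n*2^-7) for n = 0..127. A throwaway generator sketch, not part of the patch, that reproduces its entries:

#include <cstdio>

int main() {
  for (int n = 0; n < 128; ++n) {
    double f = 1.0 + n * 0x1.0p-7; // f = 1 + n/128
    std::printf("%a,%s", 1.0 / f, (n % 3 == 2) ? "\n" : " ");
  }
  return 0;
}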
-const double ONE_OVER_F[128] = { - 0x1.0000000000000p+0, 0x1.fc07f01fc07f0p-1, 0x1.f81f81f81f820p-1, - 0x1.f44659e4a4271p-1, 0x1.f07c1f07c1f08p-1, 0x1.ecc07b301ecc0p-1, - 0x1.e9131abf0b767p-1, 0x1.e573ac901e574p-1, 0x1.e1e1e1e1e1e1ep-1, - 0x1.de5d6e3f8868ap-1, 0x1.dae6076b981dbp-1, 0x1.d77b654b82c34p-1, - 0x1.d41d41d41d41dp-1, 0x1.d0cb58f6ec074p-1, 0x1.cd85689039b0bp-1, - 0x1.ca4b3055ee191p-1, 0x1.c71c71c71c71cp-1, 0x1.c3f8f01c3f8f0p-1, - 0x1.c0e070381c0e0p-1, 0x1.bdd2b899406f7p-1, 0x1.bacf914c1bad0p-1, - 0x1.b7d6c3dda338bp-1, 0x1.b4e81b4e81b4fp-1, 0x1.b2036406c80d9p-1, - 0x1.af286bca1af28p-1, 0x1.ac5701ac5701bp-1, 0x1.a98ef606a63bep-1, - 0x1.a6d01a6d01a6dp-1, 0x1.a41a41a41a41ap-1, 0x1.a16d3f97a4b02p-1, - 0x1.9ec8e951033d9p-1, 0x1.9c2d14ee4a102p-1, 0x1.999999999999ap-1, - 0x1.970e4f80cb872p-1, 0x1.948b0fcd6e9e0p-1, 0x1.920fb49d0e229p-1, - 0x1.8f9c18f9c18fap-1, 0x1.8d3018d3018d3p-1, 0x1.8acb90f6bf3aap-1, - 0x1.886e5f0abb04ap-1, 0x1.8618618618618p-1, 0x1.83c977ab2beddp-1, - 0x1.8181818181818p-1, 0x1.7f405fd017f40p-1, 0x1.7d05f417d05f4p-1, - 0x1.7ad2208e0ecc3p-1, 0x1.78a4c8178a4c8p-1, 0x1.767dce434a9b1p-1, - 0x1.745d1745d1746p-1, 0x1.724287f46debcp-1, 0x1.702e05c0b8170p-1, - 0x1.6e1f76b4337c7p-1, 0x1.6c16c16c16c17p-1, 0x1.6a13cd1537290p-1, - 0x1.6816816816817p-1, 0x1.661ec6a5122f9p-1, 0x1.642c8590b2164p-1, - 0x1.623fa77016240p-1, 0x1.6058160581606p-1, 0x1.5e75bb8d015e7p-1, - 0x1.5c9882b931057p-1, 0x1.5ac056b015ac0p-1, 0x1.58ed2308158edp-1, - 0x1.571ed3c506b3ap-1, 0x1.5555555555555p-1, 0x1.5390948f40febp-1, - 0x1.51d07eae2f815p-1, 0x1.5015015015015p-1, 0x1.4e5e0a72f0539p-1, - 0x1.4cab88725af6ep-1, 0x1.4afd6a052bf5bp-1, 0x1.49539e3b2d067p-1, - 0x1.47ae147ae147bp-1, 0x1.460cbc7f5cf9ap-1, 0x1.446f86562d9fbp-1, - 0x1.42d6625d51f87p-1, 0x1.4141414141414p-1, 0x1.3fb013fb013fbp-1, - 0x1.3e22cbce4a902p-1, 0x1.3c995a47babe7p-1, 0x1.3b13b13b13b14p-1, - 0x1.3991c2c187f63p-1, 0x1.3813813813814p-1, 0x1.3698df3de0748p-1, - 0x1.3521cfb2b78c1p-1, 0x1.33ae45b57bcb2p-1, 0x1.323e34a2b10bfp-1, - 0x1.30d190130d190p-1, 0x1.2f684bda12f68p-1, 0x1.2e025c04b8097p-1, - 0x1.2c9fb4d812ca0p-1, 0x1.2b404ad012b40p-1, 0x1.29e4129e4129ep-1, - 0x1.288b01288b013p-1, 0x1.27350b8812735p-1, 0x1.25e22708092f1p-1, - 0x1.2492492492492p-1, 0x1.23456789abcdfp-1, 0x1.21fb78121fb78p-1, - 0x1.20b470c67c0d9p-1, 0x1.1f7047dc11f70p-1, 0x1.1e2ef3b3fb874p-1, - 0x1.1cf06ada2811dp-1, 0x1.1bb4a4046ed29p-1, 0x1.1a7b9611a7b96p-1, - 0x1.19453808ca29cp-1, 0x1.1811811811812p-1, 0x1.16e0689427379p-1, - 0x1.15b1e5f75270dp-1, 0x1.1485f0e0acd3bp-1, 0x1.135c81135c811p-1, - 0x1.12358e75d3033p-1, 0x1.1111111111111p-1, 0x1.0fef010fef011p-1, - 0x1.0ecf56be69c90p-1, 0x1.0db20a88f4696p-1, 0x1.0c9714fbcda3bp-1, - 0x1.0b7e6ec259dc8p-1, 0x1.0a6810a6810a7p-1, 0x1.0953f39010954p-1, - 0x1.0842108421084p-1, 0x1.073260a47f7c6p-1, 0x1.0624dd2f1a9fcp-1, - 0x1.05197f7d73404p-1, 0x1.0410410410410p-1, 0x1.03091b51f5e1ap-1, - 0x1.0204081020408p-1, 0x1.0101010101010p-1}; - // Lookup table for log(f) = log(1 + n*2^(-7)) where n = 0..127, // computed and stored as float precision constants. // Generated by Sollya with the following commands: @@ -136,52 +90,6 @@ const float LOG_F_FLOAT[128] = { 0x1.52a2d2p-1f, 0x1.54b246p-1f, 0x1.56bf9ep-1f, 0x1.58cadcp-1f, 0x1.5ad404p-1f, 0x1.5cdb1ep-1f, 0x1.5ee02ap-1f, 0x1.60e33p-1f}; -// Lookup table for log(f) = log(1 + n*2^(-7)) where n = 0..127. 
-const double LOG_F[128] = { - 0x0.0000000000000p+0, 0x1.fe02a6b106788p-8, 0x1.fc0a8b0fc03e3p-7, - 0x1.7b91b07d5b11ap-6, 0x1.f829b0e783300p-6, 0x1.39e87b9febd5fp-5, - 0x1.77458f632dcfcp-5, 0x1.b42dd711971bep-5, 0x1.f0a30c01162a6p-5, - 0x1.16536eea37ae0p-4, 0x1.341d7961bd1d0p-4, 0x1.51b073f06183fp-4, - 0x1.6f0d28ae56b4bp-4, 0x1.8c345d6319b20p-4, 0x1.a926d3a4ad563p-4, - 0x1.c5e548f5bc743p-4, 0x1.e27076e2af2e5p-4, 0x1.fec9131dbeabap-4, - 0x1.0d77e7cd08e59p-3, 0x1.1b72ad52f67a0p-3, 0x1.29552f81ff523p-3, - 0x1.371fc201e8f74p-3, 0x1.44d2b6ccb7d1ep-3, 0x1.526e5e3a1b437p-3, - 0x1.5ff3070a793d3p-3, 0x1.6d60fe719d21cp-3, 0x1.7ab890210d909p-3, - 0x1.87fa06520c910p-3, 0x1.9525a9cf456b4p-3, 0x1.a23bc1fe2b563p-3, - 0x1.af3c94e80bff2p-3, 0x1.bc286742d8cd6p-3, 0x1.c8ff7c79a9a21p-3, - 0x1.d5c216b4fbb91p-3, 0x1.e27076e2af2e5p-3, 0x1.ef0adcbdc5936p-3, - 0x1.fb9186d5e3e2ap-3, 0x1.0402594b4d040p-2, 0x1.0a324e27390e3p-2, - 0x1.1058bf9ae4ad5p-2, 0x1.1675cababa60ep-2, 0x1.1c898c16999fap-2, - 0x1.22941fbcf7965p-2, 0x1.2895a13de86a3p-2, 0x1.2e8e2bae11d30p-2, - 0x1.347dd9a987d54p-2, 0x1.3a64c556945e9p-2, 0x1.404308686a7e3p-2, - 0x1.4618bc21c5ec2p-2, 0x1.4be5f957778a0p-2, 0x1.51aad872df82dp-2, - 0x1.5767717455a6cp-2, 0x1.5d1bdbf5809cap-2, 0x1.62c82f2b9c795p-2, - 0x1.686c81e9b14aep-2, 0x1.6e08eaa2ba1e3p-2, 0x1.739d7f6bbd006p-2, - 0x1.792a55fdd47a2p-2, 0x1.7eaf83b82afc3p-2, 0x1.842d1da1e8b17p-2, - 0x1.89a3386c1425ap-2, 0x1.8f11e873662c7p-2, 0x1.947941c2116fap-2, - 0x1.99d958117e08ap-2, 0x1.9f323ecbf984bp-2, 0x1.a484090e5bb0ap-2, - 0x1.a9cec9a9a0849p-2, 0x1.af1293247786bp-2, 0x1.b44f77bcc8f62p-2, - 0x1.b9858969310fbp-2, 0x1.beb4d9da71b7bp-2, 0x1.c3dd7a7cdad4dp-2, - 0x1.c8ff7c79a9a21p-2, 0x1.ce1af0b85f3ebp-2, 0x1.d32fe7e00ebd5p-2, - 0x1.d83e7258a2f3ep-2, 0x1.dd46a04c1c4a0p-2, 0x1.e24881a7c6c26p-2, - 0x1.e744261d68787p-2, 0x1.ec399d2468cc0p-2, 0x1.f128f5faf06ecp-2, - 0x1.f6123fa7028acp-2, 0x1.faf588f78f31ep-2, 0x1.ffd2e0857f498p-2, - 0x1.02552a5a5d0fep-1, 0x1.04bdf9da926d2p-1, 0x1.0723e5c1cdf40p-1, - 0x1.0986f4f573520p-1, 0x1.0be72e4252a82p-1, 0x1.0e44985d1cc8bp-1, - 0x1.109f39e2d4c96p-1, 0x1.12f719593efbcp-1, 0x1.154c3d2f4d5e9p-1, - 0x1.179eabbd899a0p-1, 0x1.19ee6b467c96ep-1, 0x1.1c3b81f713c24p-1, - 0x1.1e85f5e7040d0p-1, 0x1.20cdcd192ab6dp-1, 0x1.23130d7bebf42p-1, - 0x1.2555bce98f7cbp-1, 0x1.2795e1289b11ap-1, 0x1.29d37fec2b08ap-1, - 0x1.2c0e9ed448e8bp-1, 0x1.2e47436e40268p-1, 0x1.307d7334f10bep-1, - 0x1.32b1339121d71p-1, 0x1.34e289d9ce1d3p-1, 0x1.37117b54747b5p-1, - 0x1.393e0d3562a19p-1, 0x1.3b68449fffc22p-1, 0x1.3d9026a7156fap-1, - 0x1.3fb5b84d16f42p-1, 0x1.41d8fe84672aep-1, 0x1.43f9fe2f9ce67p-1, - 0x1.4618bc21c5ec2p-1, 0x1.48353d1ea88dfp-1, 0x1.4a4f85db03ebbp-1, - 0x1.4c679afccee39p-1, 0x1.4e7d811b75bb0p-1, 0x1.50913cc01686bp-1, - 0x1.52a2d265bc5aap-1, 0x1.54b2467999497p-1, 0x1.56bf9d5b3f399p-1, - 0x1.58cadb5cd7989p-1, 0x1.5ad404c359f2cp-1, 0x1.5cdb1dc6c1764p-1, - 0x1.5ee02a9241675p-1, 0x1.60e32f44788d8p-1}; - // Range reduction constants for logarithms. 
// r(0) = 1, r(127) = 0.5 // r(k) = 2^-8 * ceil(2^8 * (1 - 2^-8) / (1 + k*2^-7)) diff --git a/libc/src/math/generic/common_constants.h b/libc/src/math/generic/common_constants.h index 291816a7889ad..72b1d564ca472 100644 --- a/libc/src/math/generic/common_constants.h +++ b/libc/src/math/generic/common_constants.h @@ -11,6 +11,7 @@ #include "src/__support/FPUtil/triple_double.h" #include "src/__support/macros/config.h" +#include "src/__support/math/acosh_float_constants.h" #include "src/__support/math/exp_constants.h" #include "src/__support/number_pair.h" @@ -20,16 +21,10 @@ namespace LIBC_NAMESPACE_DECL { // computed and stored as float precision constants. extern const float ONE_OVER_F_FLOAT[128]; -// Lookup table for (1/f) where f = 1 + n*2^(-7), n = 0..127. -extern const double ONE_OVER_F[128]; - // Lookup table for log(f) = log(1 + n*2^(-7)) where n = 0..127, // computed and stored as float precision constants. extern const float LOG_F_FLOAT[128]; -// Lookup table for log(f) = log(1 + n*2^(-7)) where n = 0..127. -extern const double LOG_F[128]; - // Lookup table for range reduction constants r for logarithms. extern const float R[128]; diff --git a/libc/src/math/generic/coshf.cpp b/libc/src/math/generic/coshf.cpp index c869f7d9dec5f..9f87564d524a6 100644 --- a/libc/src/math/generic/coshf.cpp +++ b/libc/src/math/generic/coshf.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/math/coshf.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY diff --git a/libc/src/math/generic/erff.cpp b/libc/src/math/generic/erff.cpp index 44607a52a2e57..003b3465ac597 100644 --- a/libc/src/math/generic/erff.cpp +++ b/libc/src/math/generic/erff.cpp @@ -7,180 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/erff.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/except_value_utils.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/erff.h" namespace LIBC_NAMESPACE_DECL { -// Polynomials approximating erf(x)/x on ( k/8, (k + 1)/8 ) generated by Sollya -// with: -// > P = fpminimax(erf(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], [|D...|], -// [k/8, (k + 1)/8]); -// for k = 0..31. 
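A minimal sketch, not the libc implementation, of how a 32-interval table like the one below is consumed: the interval index is floor(8*|x|) (the removed code derives it with an exponent-bit trick and pairwise polyeval; a clamp and plain Horner are used here), and erf(x) ~ x * P_k(x^2):

#include <algorithm>
#include <cmath>

// c points to a [32][8] coefficient table such as COEFFS below.
static double erf_piecewise(double x, const double (*c)[8]) {
  double ax = std::fabs(x);
  int k = std::min(31, static_cast<int>(ax * 8.0)); // interval [k/8, (k+1)/8]
  double x2 = ax * ax;
  double p = 0.0;
  for (int i = 7; i >= 0; --i) // Horner: c[k][0] + c[k][1]*x^2 + ... + c[k][7]*x^14
    p = p * x2 + c[k][i];
  return std::copysign(ax * p, x); // erf is odd
}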
-constexpr double COEFFS[32][8] = { - {0x1.20dd750429b6dp0, -0x1.812746b037753p-2, 0x1.ce2f219e8596ap-4, - -0x1.b82cdacb78fdap-6, 0x1.56479297dfda5p-8, -0x1.8b3ac5455ef02p-11, - -0x1.126fcac367e3bp-8, 0x1.2d0bdb3ba4984p-4}, - {0x1.20dd750429b6dp0, -0x1.812746b0379a8p-2, 0x1.ce2f21a03cf2ap-4, - -0x1.b82ce30de083ep-6, 0x1.565bcad3eb60fp-8, -0x1.c02c66f659256p-11, - 0x1.f92f673385229p-14, -0x1.def402648ae9p-17}, - {0x1.20dd750429b34p0, -0x1.812746b032dcep-2, 0x1.ce2f219d84aaep-4, - -0x1.b82ce22dcf139p-6, 0x1.565b9efcd4af1p-8, -0x1.c021f1af414bcp-11, - 0x1.f7c6d177eff82p-14, -0x1.c9e4410dcf865p-17}, - {0x1.20dd750426eabp0, -0x1.812746ae592c7p-2, 0x1.ce2f211525f14p-4, - -0x1.b82ccc125e63fp-6, 0x1.56596f261cfd3p-8, -0x1.bfde1ff8eeecfp-11, - 0x1.f31a9d15dc5d8p-14, -0x1.a5a4362844b3cp-17}, - {0x1.20dd75039c705p0, -0x1.812746777e74dp-2, 0x1.ce2f17af98a1bp-4, - -0x1.b82be4b817cbep-6, 0x1.564bec2e2962ep-8, -0x1.bee86f9da3558p-11, - 0x1.e9443689dc0ccp-14, -0x1.79c0f230805d8p-17}, - {0x1.20dd74f811211p0, -0x1.81274371a3e8fp-2, 0x1.ce2ec038262e5p-4, - -0x1.b8265b82c5e1fp-6, 0x1.5615a2e239267p-8, -0x1.bc63ae023dcebp-11, - 0x1.d87c2102f7e06p-14, -0x1.49584bea41d62p-17}, - {0x1.20dd746d063e3p0, -0x1.812729a8a950fp-2, 0x1.ce2cb0a2df232p-4, - -0x1.b80eca1f51278p-6, 0x1.5572e26c46815p-8, -0x1.b715e5638b65ep-11, - 0x1.bfbb195484968p-14, -0x1.177a565c15c52p-17}, - {0x1.20dd701b44486p0, -0x1.812691145f237p-2, 0x1.ce23a06b8cfd9p-4, - -0x1.b7c1dc7245288p-6, 0x1.53e92f7f397ddp-8, -0x1.ad97cc4acf0b2p-11, - 0x1.9f028b2b09b71p-14, -0x1.cdc4da08da8c1p-18}, - {0x1.20dd5715ac332p0, -0x1.8123e680bd0ebp-2, 0x1.ce0457aded691p-4, - -0x1.b6f52d52bed4p-6, 0x1.50c291b84414cp-8, -0x1.9ea246b1ad4a9p-11, - 0x1.77654674e0cap-14, -0x1.737c11a1bcebbp-18}, - {0x1.20dce6593e114p0, -0x1.811a59c02eadcp-2, 0x1.cdab53c7cd7d5p-4, - -0x1.b526d2e321eedp-6, 0x1.4b1d32cd8b994p-8, -0x1.8963143ec0a1ep-11, - 0x1.4ad5700e4db91p-14, -0x1.231e100e43ef2p-18}, - {0x1.20db48bfd5a62p0, -0x1.80fdd84f9e308p-2, 0x1.ccd340d462983p-4, - -0x1.b196a2928768p-6, 0x1.4210c2c13a0f7p-8, -0x1.6dbdfb4ff71aep-11, - 0x1.1bca2d17fbd71p-14, -0x1.bca36f90c7cf5p-19}, - {0x1.20d64b2f8f508p0, -0x1.80b4d4f19fa8bp-2, 0x1.cb088197262e3p-4, - -0x1.ab51fd02e5b99p-6, 0x1.34e1e5e81a632p-8, -0x1.4c66377b502cep-11, - 0x1.d9ad25066213cp-15, -0x1.4b0df7dd0cfa1p-19}, - {0x1.20c8fc1243576p0, -0x1.8010cb2009e27p-2, 0x1.c7a47e9299315p-4, - -0x1.a155be5683654p-6, 0x1.233502694997bp-8, -0x1.26c94b7d813p-11, - 0x1.8094f1de25fb9p-15, -0x1.e0e3d776c6eefp-20}, - {0x1.20a9bd1611bc1p0, -0x1.7ec7fbce83f9p-2, 0x1.c1d757d7317b7p-4, - -0x1.92c160cd589fp-6, 0x1.0d307269cc5c2p-8, -0x1.fda5b0d2d1879p-12, - 0x1.2fdd7b3b14a7fp-15, -0x1.54eed4a26af5ap-20}, - {0x1.20682834f943dp0, -0x1.7c73f747bf5a9p-2, 0x1.b8c2db4a9ffd1p-4, - -0x1.7f0e4ffe989ecp-6, 0x1.e7061eae4166ep-9, -0x1.ad36e873fff2dp-12, - 0x1.d39222396128ep-16, -0x1.d83dacec5ea6bp-21}, - {0x1.1feb8d12676d7p0, -0x1.7898347284afep-2, 0x1.aba3466b34451p-4, - -0x1.663adc573e2f9p-6, 0x1.ae99fb17c3e08p-9, -0x1.602f950ad5535p-12, - 0x1.5e9717490609dp-16, -0x1.3fca107bbc8d5p-21}, - {0x1.1f12fe3c536fap0, -0x1.72b1d1f22e6d3p-2, 0x1.99fc0eed4a896p-4, - -0x1.48db0a87bd8c6p-6, 0x1.73e368895aa61p-9, -0x1.19b35d5301fc8p-12, - 0x1.007987e4bb033p-16, -0x1.a7edcd4c2dc7p-22}, - {0x1.1db7b0df84d5dp0, -0x1.6a4e4a41cde02p-2, 0x1.83bbded16455dp-4, - -0x1.2809b3b36977ep-6, 0x1.39c08bab44679p-9, -0x1.b7b45a70ed119p-13, - 0x1.6e99b36410e7bp-17, -0x1.13619bb7ebc0cp-22}, - {0x1.1bb1c85c4a527p0, -0x1.5f23b99a249a3p-2, 0x1.694c91fa0d12cp-4, - -0x1.053e1ce11c72dp-6, 
0x1.02bf72c50ea78p-9, -0x1.4f478fb56cb02p-13, - 0x1.005f80ecbe213p-17, -0x1.5f2446bde7f5bp-23}, - {0x1.18dec3bd51f9dp0, -0x1.5123f58346186p-2, 0x1.4b8a1ca536ab4p-4, - -0x1.c4243015cc723p-7, 0x1.a1a8a01d351efp-10, -0x1.f466b34f1d86bp-14, - 0x1.5f835eea0bf6ap-18, -0x1.b83165b939234p-24}, - {0x1.152804c3369f4p0, -0x1.4084cd4afd4bcp-2, 0x1.2ba2e836e47aap-4, - -0x1.800f2dfc6904bp-7, 0x1.4a6daf0669c59p-10, -0x1.6e326ab872317p-14, - 0x1.d9761a6a755a5p-19, -0x1.0fca33f9dd4b5p-24}, - {0x1.1087ad68356aap0, -0x1.2dbb044707459p-2, 0x1.0aea8ceaa0384p-4, - -0x1.40b516d52b3d2p-7, 0x1.00c9e05f01d22p-10, -0x1.076afb0dc0ff7p-14, - 0x1.39fadec400657p-19, -0x1.4b5761352e7e3p-25}, - {0x1.0b0a7a8ba4a22p0, -0x1.196990d22d4a1p-2, 0x1.d5551e6ac0c4dp-5, - -0x1.07cce1770bd1ap-7, 0x1.890347b8848bfp-11, -0x1.757ec96750b6ap-15, - 0x1.9b258a1e06bcep-20, -0x1.8fc6d22da7572p-26}, - {0x1.04ce2be70fb47p0, -0x1.0449e4b0b9cacp-2, 0x1.97f7424f4b0e7p-5, - -0x1.ac825439c42f4p-8, 0x1.28f5f65426dfbp-11, -0x1.05b699a90f90fp-15, - 0x1.0a888eecf4593p-20, -0x1.deace2b32bb31p-27}, - {0x1.fbf9fb0e11cc8p-1, -0x1.de2640856545ap-3, 0x1.5f5b1f47f851p-5, - -0x1.588bc71eb41b9p-8, 0x1.bc6a0a772f56dp-12, -0x1.6b9fad1f1657ap-16, - 0x1.573204ba66504p-21, -0x1.1d38065c94e44p-27}, - {0x1.ed8f18c99e031p-1, -0x1.b4cb6acd903b4p-3, 0x1.2c7f3dddd6fc1p-5, - -0x1.13052067df4ep-8, 0x1.4a5027444082fp-12, -0x1.f672bab0e2554p-17, - 0x1.b83c756348cc9p-22, -0x1.534f1a1079499p-28}, - {0x1.debd33044166dp-1, -0x1.8d7cd9053f7d8p-3, 0x1.ff9957fb3d6e7p-6, - -0x1.b50be55de0f36p-9, 0x1.e92c8ec53a628p-13, -0x1.5a4b88d508007p-17, - 0x1.1a27737559e26p-22, -0x1.942ae62cb2c14p-29}, - {0x1.cfdbf0386f3bdp-1, -0x1.68e33d93b0dc4p-3, 0x1.b2683d58f53dep-6, - -0x1.5a9174e70d26fp-9, 0x1.69ddd326d49cdp-13, -0x1.dd8f397a8219cp-18, - 0x1.6a755016ad4ddp-23, -0x1.e366e0139187dp-30}, - {0x1.c132adb8d7464p-1, -0x1.475a899f61b46p-3, 0x1.70a431397a77cp-6, - -0x1.12e3d35beeee2p-9, 0x1.0c16b05738333p-13, -0x1.4a47f873e144ep-18, - 0x1.d3d494c698c02p-24, -0x1.2302c59547fe5p-30}, - {0x1.b2f5fd05555e7p-1, -0x1.28feefbe03ec7p-3, 0x1.3923acbb3a676p-6, - -0x1.b4ff793cd6358p-10, 0x1.8ea0eb8c913bcp-14, -0x1.cb31ec2baceb1p-19, - 0x1.30011e7e80c04p-24, -0x1.617710635cb1dp-31}, - {0x1.a54853cd9593ep-1, -0x1.0dbdbaea4dc8ep-3, 0x1.0a93e2c20a0fdp-6, - -0x1.5c969ff401ea8p-10, 0x1.29e0cc64fe627p-14, -0x1.4160d8e9d3c2ap-19, - 0x1.8e7b67594624ap-25, -0x1.b1cf2c975b09bp-32}, - {0x1.983ceece09ff8p-1, -0x1.eacc78f7a2dp-4, 0x1.c74418410655fp-7, - -0x1.1756a050e441ep-10, 0x1.bff3650f7f548p-15, -0x1.c56c0217d3adap-20, - 0x1.07b4918d0b489p-25, -0x1.0d4be8c1c50f8p-32}, -}; - -LLVM_LIBC_FUNCTION(float, erff, (float x)) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - uint32_t x_u = xbits.uintval(); - uint32_t x_abs = x_u & 0x7fff'ffffU; - - if (LIBC_UNLIKELY(x_abs >= 0x4080'0000U)) { - const float ONE[2] = {1.0f, -1.0f}; - const float SMALL[2] = {-0x1.0p-25f, 0x1.0p-25f}; - - int sign = xbits.is_neg() ? 1 : 0; - - if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { - if (xbits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - return (x_abs > 0x7f80'0000) ? x : ONE[sign]; - } - - return ONE[sign] + SMALL[sign]; - } - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - // Exceptional mask = common 0 bits of 2 exceptional values. - constexpr uint32_t EXCEPT_MASK = 0x809a'6184U; - - if (LIBC_UNLIKELY((x_abs & EXCEPT_MASK) == 0)) { - // Exceptional values - if (LIBC_UNLIKELY(x_abs == 0x3f65'9229U)) // |x| = 0x1.cb2452p-1f - return x < 0.0f ? 
fputil::round_result_slightly_down(-0x1.972ea8p-1f) - : fputil::round_result_slightly_up(0x1.972ea8p-1f); - if (LIBC_UNLIKELY(x_abs == 0x4004'1e6aU)) // |x| = 0x1.083cd4p+1f - return x < 0.0f ? fputil::round_result_slightly_down(-0x1.fe3462p-1f) - : fputil::round_result_slightly_up(0x1.fe3462p-1f); - if (x_abs == 0U) - return x; - } -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - // Polynomial approximation: - // erf(x) ~ x * (c0 + c1 * x^2 + c2 * x^4 + ... + c7 * x^14) - double xd = static_cast(x); - double xsq = xd * xd; - - const uint32_t EIGHT = 3 << FPBits::FRACTION_LEN; - int idx = static_cast(FPBits(x_abs + EIGHT).get_val()); - - double x4 = xsq * xsq; - double c0 = fputil::multiply_add(xsq, COEFFS[idx][1], COEFFS[idx][0]); - double c1 = fputil::multiply_add(xsq, COEFFS[idx][3], COEFFS[idx][2]); - double c2 = fputil::multiply_add(xsq, COEFFS[idx][5], COEFFS[idx][4]); - double c3 = fputil::multiply_add(xsq, COEFFS[idx][7], COEFFS[idx][6]); - - double x8 = x4 * x4; - double p0 = fputil::multiply_add(x4, c1, c0); - double p1 = fputil::multiply_add(x4, c3, c2); - - return static_cast(xd * fputil::multiply_add(x8, p1, p0)); -} +LLVM_LIBC_FUNCTION(float, erff, (float x)) { return math::erff(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp index c464979b092c3..5c36d28c166ae 100644 --- a/libc/src/math/generic/exp10.cpp +++ b/libc/src/math/generic/exp10.cpp @@ -7,491 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10.h" -#include "common_constants.h" // Lookup tables EXP2_MID1 and EXP_M2. -#include "explogxf.h" // ziv_test_denorm. -#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/optional.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/dyadic_float.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/FPUtil/triple_double.h" -#include "src/__support/common.h" -#include "src/__support/integer_literals.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/exp10.h" namespace LIBC_NAMESPACE_DECL { -using fputil::DoubleDouble; -using fputil::TripleDouble; -using Float128 = typename fputil::DyadicFloat<128>; - -using LIBC_NAMESPACE::operator""_u128; - -// log2(10) -constexpr double LOG2_10 = 0x1.a934f0979a371p+1; - -// -2^-12 * log10(2) -// > a = -2^-12 * log10(2); -// > b = round(a, 32, RN); -// > c = round(a - b, 32, RN); -// > d = round(a - b - c, D, RN); -// Errors < 1.5 * 2^-144 -constexpr double MLOG10_2_EXP2_M12_HI = -0x1.3441350ap-14; -constexpr double MLOG10_2_EXP2_M12_MID = 0x1.0c0219dc1da99p-51; - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -constexpr double MLOG10_2_EXP2_M12_MID_32 = 0x1.0c0219dcp-51; -constexpr double MLOG10_2_EXP2_M12_LO = 0x1.da994fd20dba2p-87; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -// Error bounds: -// Errors when using double precision. -constexpr double ERR_D = 0x1.8p-63; - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Errors when using double-double precision. -constexpr double ERR_DD = 0x1.8p-99; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -namespace { - -// Polynomial approximations with double precision. 
Generated by Sollya with: -// > P = fpminimax((10^x - 1)/x, 3, [|D...|], [-2^-14, 2^-14]); -// > P; -// Error bounds: -// | output - (10^dx - 1) / dx | < 2^-52. -LIBC_INLINE double poly_approx_d(double dx) { - // dx^2 - double dx2 = dx * dx; - double c0 = - fputil::multiply_add(dx, 0x1.53524c73cea6ap+1, 0x1.26bb1bbb55516p+1); - double c1 = - fputil::multiply_add(dx, 0x1.2bd75cc6afc65p+0, 0x1.0470587aa264cp+1); - double p = fputil::multiply_add(dx2, c1, c0); - return p; -} - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Polynomial approximation with double-double precision. Generated by Solya -// with: -// > P = fpminimax((10^x - 1)/x, 5, [|DD...|], [-2^-14, 2^-14]); -// Error bounds: -// | output - 10^(dx) | < 2^-101 -DoubleDouble poly_approx_dd(const DoubleDouble &dx) { - // Taylor polynomial. - constexpr DoubleDouble COEFFS[] = { - {0, 0x1p0}, - {-0x1.f48ad494e927bp-53, 0x1.26bb1bbb55516p1}, - {-0x1.e2bfab3191cd2p-53, 0x1.53524c73cea69p1}, - {0x1.80fb65ec3b503p-53, 0x1.0470591de2ca4p1}, - {0x1.338fc05e21e55p-54, 0x1.2bd7609fd98c4p0}, - {0x1.d4ea116818fbp-56, 0x1.1429ffd519865p-1}, - {-0x1.872a8ff352077p-57, 0x1.a7ed70847c8b3p-3}, - - }; - - DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], - COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); - return p; -} - -// Polynomial approximation with 128-bit precision: -// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7 -// For |dx| < 2^-14: -// | output - 10^dx | < 1.5 * 2^-124. -Float128 poly_approx_f128(const Float128 &dx) { - constexpr Float128 COEFFS_128[]{ - {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 - {Sign::POS, -126, 0x935d8ddd'aaa8ac16'ea56d62b'82d30a2d_u128}, - {Sign::POS, -126, 0xa9a92639'e753443a'80a99ce7'5f4d5bdb_u128}, - {Sign::POS, -126, 0x82382c8e'f1652304'6a4f9d7d'bf6c9635_u128}, - {Sign::POS, -124, 0x12bd7609'fd98c44c'34578701'9216c7af_u128}, - {Sign::POS, -127, 0x450a7ff4'7535d889'cc41ed7e'0d27aee5_u128}, - {Sign::POS, -130, 0xd3f6b844'702d636b'8326bb91'a6e7601d_u128}, - {Sign::POS, -130, 0x45b937f0'd05bb1cd'fa7b46df'314112a9_u128}, - }; - - Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], - COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], - COEFFS_128[6], COEFFS_128[7]); - return p; -} - -// Compute 10^(x) using 128-bit precision. -// TODO(lntue): investigate triple-double precision implementation for this -// step. -Float128 exp10_f128(double x, double kd, int idx1, int idx2) { - double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double t2 = kd * MLOG10_2_EXP2_M12_MID_32; // exact - double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-144 - - Float128 dx = fputil::quick_add( - Float128(t1), fputil::quick_add(Float128(t2), Float128(t3))); - - // TODO: Skip recalculating exp_mid1 and exp_mid2. - Float128 exp_mid1 = - fputil::quick_add(Float128(EXP2_MID1[idx1].hi), - fputil::quick_add(Float128(EXP2_MID1[idx1].mid), - Float128(EXP2_MID1[idx1].lo))); - - Float128 exp_mid2 = - fputil::quick_add(Float128(EXP2_MID2[idx2].hi), - fputil::quick_add(Float128(EXP2_MID2[idx2].mid), - Float128(EXP2_MID2[idx2].lo))); - - Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); - - Float128 p = poly_approx_f128(dx); - - Float128 r = fputil::quick_mul(exp_mid, p); - - r.exponent += static_cast(kd) >> 12; - - return r; -} - -// Compute 10^x with double-double precision. 
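// [Editorial note, not part of the patch: a DoubleDouble here is an
//  unevaluated pair (hi, lo) of doubles whose sum carries roughly 2 x 53 bits
//  of precision; exact_add and quick_mult below keep each step's rounding
//  error in the lo word, which is how the ~2^-102 error bound stated further
//  down is reached.]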
-DoubleDouble exp10_double_double(double x, double kd, - const DoubleDouble &exp_mid) { - // Recalculate dx: - // dx = x - k * 2^-12 * log10(2) - double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double t2 = kd * MLOG10_2_EXP2_M12_MID_32; // exact - double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-140 - - DoubleDouble dx = fputil::exact_add(t1, t2); - dx.lo += t3; - - // Degree-6 polynomial approximation in double-double precision. - // | p - 10^x | < 2^-103. - DoubleDouble p = poly_approx_dd(dx); - - // Error bounds: 2^-102. - DoubleDouble r = fputil::quick_mult(exp_mid, p); - - return r; -} -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -// When output is denormal. -double exp10_denorm(double x) { - // Range reduction. - double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21); - int k = static_cast(cpp::bit_cast(tmp) >> 19); - double kd = static_cast(k); - - uint32_t idx1 = (k >> 6) & 0x3f; - uint32_t idx2 = k & 0x3f; - - int hi = k >> 12; - - DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; - DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; - DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); - - // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14 - double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h); - - double mid_lo = dx * exp_mid.hi; - - // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. - double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - return ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D) - .value(); -#else - if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use double-double - DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); - - if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use 128-bit precision - Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); - - return static_cast(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} - -// Check for exceptional cases when: -// * log10(1 - 2^-54) < x < log10(1 + 2^-53) -// * x >= log10(2^1024) -// * x <= log10(2^-1022) -// * x is inf or nan -double set_exceptional(double x) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - uint64_t x_abs = xbits.abs().uintval(); - - // |x| < log10(1 + 2^-53) - if (x_abs <= 0x3c8bcb7b1526e50e) { - // 10^(x) ~ 1 + x/2 - return fputil::multiply_add(x, 0.5, 1.0); - } - - // x <= log10(2^-1022) || x >= log10(2^1024) or inf/nan. 
- if (x_u >= 0xc0733a7146f72a42) { - // x <= log10(2^-1075) or -inf/nan - if (x_u > 0xc07439b746e36b52) { - // exp(-Inf) = 0 - if (xbits.is_inf()) - return 0.0; - - // exp(nan) = nan - if (xbits.is_nan()) - return x; - - if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_subnormal().get_val(); - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_UNDERFLOW); - return 0.0; - } - - return exp10_denorm(x); - } - - // x >= log10(2^1024) or +inf/nan - // x is finite - if (x_u < 0x7ff0'0000'0000'0000ULL) { - int rounding = fputil::quick_get_round(); - if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW); - } - // x is +inf or nan - return x + FPBits::inf().get_val(); -} - -} // namespace - -LLVM_LIBC_FUNCTION(double, exp10, (double x)) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - - // x <= log10(2^-1022) or x >= log10(2^1024) or - // log10(1 - 2^-54) < x < log10(1 + 2^-53). - if (LIBC_UNLIKELY(x_u >= 0xc0733a7146f72a42 || - (x_u <= 0xbc7bcb7b1526e50e && x_u >= 0x40734413509f79ff) || - x_u < 0x3c8bcb7b1526e50e)) { - return set_exceptional(x); - } - - // Now log10(2^-1075) < x <= log10(1 - 2^-54) or - // log10(1 + 2^-53) < x < log10(2^1024) - - // Range reduction: - // Let x = log10(2) * (hi + mid1 + mid2) + lo - // in which: - // hi is an integer - // mid1 * 2^6 is an integer - // mid2 * 2^12 is an integer - // then: - // 10^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 10^(lo). - // With this formula: - // - multiplying by 2^hi is exact and cheap, simply by adding the exponent - // field. - // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. - // - 10^(lo) ~ 1 + a0*lo + a1 * lo^2 + ... - // - // We compute (hi + mid1 + mid2) together by perform the rounding on - // x * log2(10) * 2^12. - // Since |x| < |log10(2^-1075)| < 2^9, - // |x * 2^12| < 2^9 * 2^12 < 2^21, - // So we can fit the rounded result round(x * 2^12) in int32_t. - // Thus, the goal is to be able to use an additional addition and fixed width - // shift to get an int32_t representing round(x * 2^12). - // - // Assuming int32_t using 2-complement representation, since the mantissa part - // of a double precision is unsigned with the leading bit hidden, if we add an - // extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 2^23 to the product, the - // part that are < 2^e2 in resulted mantissa of (x*2^12*L2E + C) can be - // considered as a proper 2-complement representations of x*2^12. - // - // One small problem with this approach is that the sum (x*2^12 + C) in - // double precision is rounded to the least significant bit of the dorminant - // factor C. In order to minimize the rounding errors from this addition, we - // want to minimize e1. Another constraint that we want is that after - // shifting the mantissa so that the least significant bit of int32_t - // corresponds to the unit bit of (x*2^12*L2E), the sign is correct without - // any adjustment. So combining these 2 requirements, we can choose - // C = 2^33 + 2^32, so that the sign bit corresponds to 2^31 bit, and hence - // after right shifting the mantissa, the resulting int32_t has correct sign. - // With this choice of C, the number of mantissa bits we need to shift to the - // right is: 52 - 33 = 19. 
- // - // Moreover, since the integer right shifts are equivalent to rounding down, - // we can add an extra 0.5 so that it will become round-to-nearest, tie-to- - // +infinity. So in particular, we can compute: - // hmm = x * 2^12 + C, - // where C = 2^33 + 2^32 + 2^-1, then if - // k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19), - // the reduced argument: - // lo = x - log10(2) * 2^-12 * k is bounded by: - // |lo| = |x - log10(2) * 2^-12 * k| - // = log10(2) * 2^-12 * | x * log2(10) * 2^12 - k | - // <= log10(2) * 2^-12 * (2^-1 + 2^-19) - // < 1.5 * 2^-2 * (2^-13 + 2^-31) - // = 1.5 * (2^-15 * 2^-31) - // - // Finally, notice that k only uses the mantissa of x * 2^12, so the - // exponent 2^12 is not needed. So we can simply define - // C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and - // k = int32_t(lower 51 bits of double(x + C) >> 19). - - // Rounding errors <= 2^-31. - double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21); - int k = static_cast(cpp::bit_cast(tmp) >> 19); - double kd = static_cast(k); - - uint32_t idx1 = (k >> 6) & 0x3f; - uint32_t idx2 = k & 0x3f; - - int hi = k >> 12; - - DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; - DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; - DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); - - // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14 - double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h); - - // We use the degree-4 polynomial to approximate 10^(lo): - // 10^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 - // = 1 + lo * P(lo) - // So that the errors are bounded by: - // |P(lo) - (10^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58 - // Let P_ be an evaluation of P where all intermediate computations are in - // double precision. Using either Horner's or Estrin's schemes, the evaluated - // errors can be bounded by: - // |P_(lo) - P(lo)| < 2^-51 - // => |lo * P_(lo) - (2^lo - 1) | < 2^-65 - // => 2^(mid1 + mid2) * |lo * P_(lo) - expm1(lo)| < 2^-64. - // Since we approximate - // 2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo, - // We use the expression: - // (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~ - // ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo) - // with errors bounded by 2^-64. - - double mid_lo = dx * exp_mid.hi; - - // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. - double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = - cpp::bit_cast(exp_hi + cpp::bit_cast(exp_mid.hi + lo)); - return r; -#else - double upper = exp_mid.hi + (lo + ERR_D); - double lower = exp_mid.hi + (lo - ERR_D); - - if (LIBC_LIKELY(upper == lower)) { - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper)); - return r; - } - - // Exact outputs when x = 1, 2, ..., 22 + hard to round with x = 23. - // Quick check mask: 0x800f'ffffU = ~(bits of 1.0 | ... 
| bits of 23.0) - if (LIBC_UNLIKELY((x_u & 0x8000'ffff'ffff'ffffULL) == 0ULL)) { - switch (x_u) { - case 0x3ff0000000000000: // x = 1.0 - return 10.0; - case 0x4000000000000000: // x = 2.0 - return 100.0; - case 0x4008000000000000: // x = 3.0 - return 1'000.0; - case 0x4010000000000000: // x = 4.0 - return 10'000.0; - case 0x4014000000000000: // x = 5.0 - return 100'000.0; - case 0x4018000000000000: // x = 6.0 - return 1'000'000.0; - case 0x401c000000000000: // x = 7.0 - return 10'000'000.0; - case 0x4020000000000000: // x = 8.0 - return 100'000'000.0; - case 0x4022000000000000: // x = 9.0 - return 1'000'000'000.0; - case 0x4024000000000000: // x = 10.0 - return 10'000'000'000.0; - case 0x4026000000000000: // x = 11.0 - return 100'000'000'000.0; - case 0x4028000000000000: // x = 12.0 - return 1'000'000'000'000.0; - case 0x402a000000000000: // x = 13.0 - return 10'000'000'000'000.0; - case 0x402c000000000000: // x = 14.0 - return 100'000'000'000'000.0; - case 0x402e000000000000: // x = 15.0 - return 1'000'000'000'000'000.0; - case 0x4030000000000000: // x = 16.0 - return 10'000'000'000'000'000.0; - case 0x4031000000000000: // x = 17.0 - return 100'000'000'000'000'000.0; - case 0x4032000000000000: // x = 18.0 - return 1'000'000'000'000'000'000.0; - case 0x4033000000000000: // x = 19.0 - return 10'000'000'000'000'000'000.0; - case 0x4034000000000000: // x = 20.0 - return 100'000'000'000'000'000'000.0; - case 0x4035000000000000: // x = 21.0 - return 1'000'000'000'000'000'000'000.0; - case 0x4036000000000000: // x = 22.0 - return 10'000'000'000'000'000'000'000.0; - case 0x4037000000000000: // x = 23.0 - return 0x1.52d02c7e14af6p76 + x; - } - } - - // Use double-double - DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); - - double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); - double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); - - if (LIBC_LIKELY(upper_dd == lower_dd)) { - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. 
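// [Editorial worked example, not part of the patch: adding hi to the biased
//  exponent field scales a finite double by 2^hi, e.g.
//    cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(1.5) + (int64_t(3) << 52))
//  yields 12.0, provided the result stays normal and does not overflow.]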
- int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); - return r; - } - - // Use 128-bit precision - Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); - - return static_cast(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} +LLVM_LIBC_FUNCTION(double, exp10, (double x)) { return math::exp10(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10f.cpp b/libc/src/math/generic/exp10f.cpp index 5284c380f52ec..b2d4f097bc7ce 100644 --- a/libc/src/math/generic/exp10f.cpp +++ b/libc/src/math/generic/exp10f.cpp @@ -7,12 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10f.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/math/generic/exp10f_impl.h" + +#include "src/__support/math/exp10f.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return generic::exp10f(x); } +LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return math::exp10f(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp index 31abf3b4f89b2..cb3c8599c9231 100644 --- a/libc/src/math/generic/exp10f16.cpp +++ b/libc/src/math/generic/exp10f16.cpp @@ -7,128 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10f16.h" -#include "expxf16.h" -#include "hdr/errno_macros.h" -#include "hdr/fenv_macros.h" -#include "src/__support/CPP/array.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" -#include "src/__support/FPUtil/except_value_utils.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" -#include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/exp10f16.h" namespace LIBC_NAMESPACE_DECL { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT -static constexpr size_t N_EXP10F16_EXCEPTS = 5; -#else -static constexpr size_t N_EXP10F16_EXCEPTS = 8; -#endif - -static constexpr fputil::ExceptValues - EXP10F16_EXCEPTS = {{ - // x = 0x1.8f4p-2, exp10f16(x) = 0x1.3ap+1 (RZ) - {0x363dU, 0x40e8U, 1U, 0U, 1U}, - // x = 0x1.95cp-2, exp10f16(x) = 0x1.3ecp+1 (RZ) - {0x3657U, 0x40fbU, 1U, 0U, 0U}, - // x = -0x1.018p-4, exp10f16(x) = 0x1.bbp-1 (RZ) - {0xac06U, 0x3aecU, 1U, 0U, 0U}, - // x = -0x1.c28p+0, exp10f16(x) = 0x1.1ccp-6 (RZ) - {0xbf0aU, 0x2473U, 1U, 0U, 0U}, - // x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ) - {0xc387U, 0x09a5U, 1U, 0U, 0U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ) - {0x4030U, 0x57c1U, 1U, 0U, 1U}, - // x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ) - {0x406eU, 0x591fU, 1U, 0U, 1U}, - // x = 0x1.1b8p+2, exp10f16(x) = 0x1.a4p+14 (RZ) - {0x446eU, 0x7690U, 1U, 0U, 1U}, -#endif - }}; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { - using FPBits = fputil::FPBits; - FPBits x_bits(x); - - uint16_t x_u = x_bits.uintval(); - uint16_t x_abs = x_u & 0x7fffU; - - // When |x| >= 5, or x is NaN. 
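// [Editorial note, not part of the patch: 0x4500U and 0xc800U below are the
//  IEEE binary16 encodings of 5.0 and -8.0; 10^5 already overflows the
//  largest float16 value (65504) and 10^-8 underflows its smallest
//  subnormal, so both tails are handled in this branch.]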
- if (LIBC_UNLIKELY(x_abs >= 0x4500U)) { - // exp10(NaN) = NaN - if (x_bits.is_nan()) { - if (x_bits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - return x; - } - - // When x >= 5. - if (x_bits.is_pos()) { - // exp10(+inf) = +inf - if (x_bits.is_inf()) - return FPBits::inf().get_val(); - - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_UPWARD: - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW); - return FPBits::inf().get_val(); - default: - return FPBits::max_normal().get_val(); - } - } - - // When x <= -8. - if (x_u >= 0xc800U) { - // exp10(-inf) = +0 - if (x_bits.is_inf()) - return FPBits::zero().get_val(); - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - - if (fputil::fenv_is_round_up()) - return FPBits::min_subnormal().get_val(); - return FPBits::zero().get_val(); - } - } - - // When x is 1, 2, 3, or 4. These are hard-to-round cases with exact results. - if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) { - switch (x_u) { - case 0x3c00U: // x = 1.0f16 - return fputil::cast(10.0); - case 0x4000U: // x = 2.0f16 - return fputil::cast(100.0); - case 0x4200U: // x = 3.0f16 - return fputil::cast(1'000.0); - case 0x4400U: // x = 4.0f16 - return fputil::cast(10'000.0); - } - } - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - // 10^x = 2^((hi + mid) * log2(10)) * 10^lo - auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x); - return fputil::cast(exp2_hi_mid * exp10_lo); -} +LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { return math::exp10f16(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp index 545c479694811..6c2fdbea418df 100644 --- a/libc/src/math/generic/exp10m1f16.cpp +++ b/libc/src/math/generic/exp10m1f16.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10m1f16.h" -#include "expxf16.h" #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/FEnvImpl.h" @@ -21,6 +20,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/exp10f16_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/explogxf.cpp b/libc/src/math/generic/explogxf.cpp deleted file mode 100644 index d38efa0269693..0000000000000 --- a/libc/src/math/generic/explogxf.cpp +++ /dev/null @@ -1,75 +0,0 @@ -//===-- Single-precision general exp/log functions ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "explogxf.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -// N[Table[Log[2, 1 + x], {x, 0/64, 63/64, 1/64}], 40] -alignas(8) const double LOG_P1_LOG2[LOG_P1_SIZE] = { - 0x0.0000000000000p+0, 0x1.6e79685c2d22ap-6, 0x1.6bad3758efd87p-5, - 0x1.0eb389fa29f9bp-4, 0x1.663f6fac91316p-4, 0x1.bc84240adabbap-4, - 0x1.08c588cda79e4p-3, 0x1.32ae9e278ae1ap-3, 0x1.5c01a39fbd688p-3, - 0x1.84c2bd02f03b3p-3, 0x1.acf5e2db4ec94p-3, 0x1.d49ee4c325970p-3, - 0x1.fbc16b902680ap-3, 0x1.11307dad30b76p-2, 0x1.24407ab0e073ap-2, - 0x1.37124cea4cdedp-2, 0x1.49a784bcd1b8bp-2, 0x1.5c01a39fbd688p-2, - 0x1.6e221cd9d0cdep-2, 0x1.800a563161c54p-2, 0x1.91bba891f1709p-2, - 0x1.a33760a7f6051p-2, 0x1.b47ebf73882a1p-2, 0x1.c592fad295b56p-2, - 0x1.d6753e032ea0fp-2, 0x1.e726aa1e754d2p-2, 0x1.f7a8568cb06cfp-2, - 0x1.03fda8b97997fp-1, 0x1.0c10500d63aa6p-1, 0x1.140c9faa1e544p-1, - 0x1.1bf311e95d00ep-1, 0x1.23c41d42727c8p-1, 0x1.2b803473f7ad1p-1, - 0x1.3327c6ab49ca7p-1, 0x1.3abb3faa02167p-1, 0x1.423b07e986aa9p-1, - 0x1.49a784bcd1b8bp-1, 0x1.510118708a8f9p-1, 0x1.5848226989d34p-1, - 0x1.5f7cff41e09afp-1, 0x1.66a008e4788ccp-1, 0x1.6db196a76194ap-1, - 0x1.74b1fd64e0754p-1, 0x1.7ba18f93502e4p-1, 0x1.82809d5be7073p-1, - 0x1.894f74b06ef8bp-1, 0x1.900e6160002cdp-1, 0x1.96bdad2acb5f6p-1, - 0x1.9d5d9fd5010b3p-1, 0x1.a3ee7f38e181fp-1, 0x1.aa708f58014d3p-1, - 0x1.b0e4126bcc86cp-1, 0x1.b74948f5532dap-1, 0x1.bda071cc67e6ep-1, - 0x1.c3e9ca2e1a055p-1, 0x1.ca258dca93316p-1, 0x1.d053f6d260896p-1, - 0x1.d6753e032ea0fp-1, 0x1.dc899ab3ff56cp-1, 0x1.e29142e0e0140p-1, - 0x1.e88c6b3626a73p-1, 0x1.ee7b471b3a950p-1, 0x1.f45e08bcf0655p-1, - 0x1.fa34e1177c233p-1, -}; - -// N[Table[1/(1 + x), {x, 0/64, 63/64, 1/64}], 40] -alignas(8) const double LOG_P1_1_OVER[LOG_P1_SIZE] = { - 0x1.0000000000000p+0, 0x1.f81f81f81f820p-1, 0x1.f07c1f07c1f08p-1, - 0x1.e9131abf0b767p-1, 0x1.e1e1e1e1e1e1ep-1, 0x1.dae6076b981dbp-1, - 0x1.d41d41d41d41dp-1, 0x1.cd85689039b0bp-1, 0x1.c71c71c71c71cp-1, - 0x1.c0e070381c0e0p-1, 0x1.bacf914c1bad0p-1, 0x1.b4e81b4e81b4fp-1, - 0x1.af286bca1af28p-1, 0x1.a98ef606a63bep-1, 0x1.a41a41a41a41ap-1, - 0x1.9ec8e951033d9p-1, 0x1.999999999999ap-1, 0x1.948b0fcd6e9e0p-1, - 0x1.8f9c18f9c18fap-1, 0x1.8acb90f6bf3aap-1, 0x1.8618618618618p-1, - 0x1.8181818181818p-1, 0x1.7d05f417d05f4p-1, 0x1.78a4c8178a4c8p-1, - 0x1.745d1745d1746p-1, 0x1.702e05c0b8170p-1, 0x1.6c16c16c16c17p-1, - 0x1.6816816816817p-1, 0x1.642c8590b2164p-1, 0x1.6058160581606p-1, - 0x1.5c9882b931057p-1, 0x1.58ed2308158edp-1, 0x1.5555555555555p-1, - 0x1.51d07eae2f815p-1, 0x1.4e5e0a72f0539p-1, 0x1.4afd6a052bf5bp-1, - 0x1.47ae147ae147bp-1, 0x1.446f86562d9fbp-1, 0x1.4141414141414p-1, - 0x1.3e22cbce4a902p-1, 0x1.3b13b13b13b14p-1, 0x1.3813813813814p-1, - 0x1.3521cfb2b78c1p-1, 0x1.323e34a2b10bfp-1, 0x1.2f684bda12f68p-1, - 0x1.2c9fb4d812ca0p-1, 0x1.29e4129e4129ep-1, 0x1.27350b8812735p-1, - 0x1.2492492492492p-1, 0x1.21fb78121fb78p-1, 0x1.1f7047dc11f70p-1, - 0x1.1cf06ada2811dp-1, 0x1.1a7b9611a7b96p-1, 0x1.1811811811812p-1, - 0x1.15b1e5f75270dp-1, 0x1.135c81135c811p-1, 0x1.1111111111111p-1, - 0x1.0ecf56be69c90p-1, 0x1.0c9714fbcda3bp-1, 0x1.0a6810a6810a7p-1, - 0x1.0842108421084p-1, 0x1.0624dd2f1a9fcp-1, 0x1.0410410410410p-1, - 0x1.0204081020408p-1}; - -// Taylos series expansion for Log[2, 1 + x] splitted to EVEN AND ODD numbers -// K_LOG2_ODD starts from x^3 -alignas(8) const - double K_LOG2_ODD[4] = 
{0x1.ec709dc3a03fdp-2, 0x1.2776c50ef9bfep-2, - 0x1.a61762a7aded9p-3, 0x1.484b13d7c02a9p-3}; - -alignas(8) const - double K_LOG2_EVEN[4] = {-0x1.71547652b82fep-1, -0x1.71547652b82fep-2, - -0x1.ec709dc3a03fdp-3, -0x1.2776c50ef9bfep-3}; - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/explogxf.h b/libc/src/math/generic/explogxf.h index 5ae1457ca780e..a2a6d60f41f76 100644 --- a/libc/src/math/generic/explogxf.h +++ b/libc/src/math/generic/explogxf.h @@ -10,166 +10,18 @@ #define LLVM_LIBC_SRC_MATH_GENERIC_EXPLOGXF_H #include "common_constants.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/nearest_integer.h" + #include "src/__support/common.h" -#include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" - +#include "src/__support/math/acoshf_utils.h" +#include "src/__support/math/exp10f_utils.h" #include "src/__support/math/exp_utils.h" namespace LIBC_NAMESPACE_DECL { -struct ExpBase { - // Base = e - static constexpr int MID_BITS = 5; - static constexpr int MID_MASK = (1 << MID_BITS) - 1; - // log2(e) * 2^5 - static constexpr double LOG2_B = 0x1.71547652b82fep+0 * (1 << MID_BITS); - // High and low parts of -log(2) * 2^(-5) - static constexpr double M_LOGB_2_HI = -0x1.62e42fefa0000p-1 / (1 << MID_BITS); - static constexpr double M_LOGB_2_LO = - -0x1.cf79abc9e3b3ap-40 / (1 << MID_BITS); - // Look up table for bit fields of 2^(i/32) for i = 0..31, generated by Sollya - // with: - // > for i from 0 to 31 do printdouble(round(2^(i/32), D, RN)); - static constexpr int64_t EXP_2_MID[1 << MID_BITS] = { - 0x3ff0000000000000, 0x3ff059b0d3158574, 0x3ff0b5586cf9890f, - 0x3ff11301d0125b51, 0x3ff172b83c7d517b, 0x3ff1d4873168b9aa, - 0x3ff2387a6e756238, 0x3ff29e9df51fdee1, 0x3ff306fe0a31b715, - 0x3ff371a7373aa9cb, 0x3ff3dea64c123422, 0x3ff44e086061892d, - 0x3ff4bfdad5362a27, 0x3ff5342b569d4f82, 0x3ff5ab07dd485429, - 0x3ff6247eb03a5585, 0x3ff6a09e667f3bcd, 0x3ff71f75e8ec5f74, - 0x3ff7a11473eb0187, 0x3ff82589994cce13, 0x3ff8ace5422aa0db, - 0x3ff93737b0cdc5e5, 0x3ff9c49182a3f090, 0x3ffa5503b23e255d, - 0x3ffae89f995ad3ad, 0x3ffb7f76f2fb5e47, 0x3ffc199bdd85529c, - 0x3ffcb720dcef9069, 0x3ffd5818dcfba487, 0x3ffdfc97337b9b5f, - 0x3ffea4afa2a490da, 0x3fff50765b6e4540, - }; - - // Approximating e^dx with degree-5 minimax polynomial generated by Sollya: - // > Q = fpminimax(expm1(x)/x, 4, [|1, D...|], [-log(2)/64, log(2)/64]); - // Then: - // e^dx ~ P(dx) = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[3] * dx^5. - static constexpr double COEFFS[4] = { - 0x1.ffffffffe5bc8p-2, 0x1.555555555cd67p-3, 0x1.5555c2a9b48b4p-5, - 0x1.11112a0e34bdbp-7}; - - LIBC_INLINE static double powb_lo(double dx) { - using fputil::multiply_add; - double dx2 = dx * dx; - double c0 = 1.0 + dx; - // c1 = COEFFS[0] + COEFFS[1] * dx - double c1 = multiply_add(dx, ExpBase::COEFFS[1], ExpBase::COEFFS[0]); - // c2 = COEFFS[2] + COEFFS[3] * dx - double c2 = multiply_add(dx, ExpBase::COEFFS[3], ExpBase::COEFFS[2]); - // r = c4 + c5 * dx^4 - // = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[5] * dx^7 - return fputil::polyeval(dx2, c0, c1, c2); - } -}; - -struct Exp10Base : public ExpBase { - // log2(10) * 2^5 - static constexpr double LOG2_B = 0x1.a934f0979a371p1 * (1 << MID_BITS); - // High and low parts of -log10(2) * 2^(-5). 
- // Notice that since |x * log2(10)| < 150: - // |k| = |round(x * log2(10) * 2^5)| < 2^8 * 2^5 = 2^13 - // So when the FMA instructions are not available, in order for the product - // k * M_LOGB_2_HI - // to be exact, we only store the high part of log10(2) up to 38 bits - // (= 53 - 15) of precision. - // It is generated by Sollya with: - // > round(log10(2), 44, RN); - static constexpr double M_LOGB_2_HI = -0x1.34413509f8p-2 / (1 << MID_BITS); - // > round(log10(2) - 0x1.34413509f8p-2, D, RN); - static constexpr double M_LOGB_2_LO = 0x1.80433b83b532ap-44 / (1 << MID_BITS); - - // Approximating 10^dx with degree-5 minimax polynomial generated by Sollya: - // > Q = fpminimax((10^x - 1)/x, 4, [|D...|], [-log10(2)/2^6, log10(2)/2^6]); - // Then: - // 10^dx ~ P(dx) = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5. - static constexpr double COEFFS[5] = {0x1.26bb1bbb55515p1, 0x1.53524c73bd3eap1, - 0x1.0470591dff149p1, 0x1.2bd7c0a9fbc4dp0, - 0x1.1429e74a98f43p-1}; - - static double powb_lo(double dx) { - using fputil::multiply_add; - double dx2 = dx * dx; - // c0 = 1 + COEFFS[0] * dx - double c0 = multiply_add(dx, Exp10Base::COEFFS[0], 1.0); - // c1 = COEFFS[1] + COEFFS[2] * dx - double c1 = multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]); - // c2 = COEFFS[3] + COEFFS[4] * dx - double c2 = multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]); - // r = c0 + dx^2 * (c1 + c2 * dx^2) - // = c0 + c1 * dx^2 + c2 * dx^4 - // = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5. - return fputil::polyeval(dx2, c0, c1, c2); - } -}; - constexpr int LOG_P1_BITS = 6; constexpr int LOG_P1_SIZE = 1 << LOG_P1_BITS; -// N[Table[Log[2, 1 + x], {x, 0/64, 63/64, 1/64}], 40] -extern const double LOG_P1_LOG2[LOG_P1_SIZE]; - -// N[Table[1/(1 + x), {x, 0/64, 63/64, 1/64}], 40] -extern const double LOG_P1_1_OVER[LOG_P1_SIZE]; - -// Taylor series expansion for Log[2, 1 + x] splitted to EVEN AND ODD numbers -// K_LOG2_ODD starts from x^3 -extern const double K_LOG2_ODD[4]; -extern const double K_LOG2_EVEN[4]; - -// Output of range reduction for exp_b: (2^(mid + hi), lo) -// where: -// b^x = 2^(mid + hi) * b^lo -struct exp_b_reduc_t { - double mh; // 2^(mid + hi) - double lo; -}; - -// The function correctly calculates b^x value with at least float precision -// in a limited range. -// Range reduction: -// b^x = 2^(hi + mid) * b^lo -// where: -// x = (hi + mid) * log_b(2) + lo -// hi is an integer, -// 0 <= mid * 2^MID_BITS < 2^MID_BITS is an integer -// -2^(-MID_BITS - 1) <= lo * log2(b) <= 2^(-MID_BITS - 1) -// Base class needs to provide the following constants: -// - MID_BITS : number of bits after decimal points used for mid -// - MID_MASK : 2^MID_BITS - 1, mask to extract mid bits -// - LOG2_B : log2(b) * 2^MID_BITS for scaling -// - M_LOGB_2_HI : high part of -log_b(2) * 2^(-MID_BITS) -// - M_LOGB_2_LO : low part of -log_b(2) * 2^(-MID_BITS) -// - EXP_2_MID : look up table for bit fields of 2^mid -// Return: -// { 2^(hi + mid), lo } -template LIBC_INLINE exp_b_reduc_t exp_b_range_reduc(float x) { - double xd = static_cast(x); - // kd = round((hi + mid) * log2(b) * 2^MID_BITS) - double kd = fputil::nearest_integer(Base::LOG2_B * xd); - // k = round((hi + mid) * log2(b) * 2^MID_BITS) - int k = static_cast(kd); - // hi = floor(kd * 2^(-MID_BITS)) - // exp_hi = shift hi to the exponent field of double precision. 
- uint64_t exp_hi = static_cast(k >> Base::MID_BITS) - << fputil::FPBits::FRACTION_LEN; - // mh = 2^hi * 2^mid - // mh_bits = bit field of mh - uint64_t mh_bits = Base::EXP_2_MID[k & Base::MID_MASK] + exp_hi; - double mh = fputil::FPBits(mh_bits).get_val(); - // dx = lo = x - (hi + mid) * log(2) - double dx = fputil::multiply_add( - kd, Base::M_LOGB_2_LO, fputil::multiply_add(kd, Base::M_LOGB_2_HI, xd)); - return {mh, dx}; -} - // The function correctly calculates sinh(x) and cosh(x) by calculating exp(x) // and exp(-x) simultaneously. // To compute e^x, we perform the following range @@ -269,33 +121,6 @@ template LIBC_INLINE double exp_pm_eval(float x) { return r; } -// x should be positive, normal finite value -LIBC_INLINE static double log2_eval(double x) { - using FPB = fputil::FPBits; - FPB bs(x); - - double result = 0; - result += bs.get_exponent(); - - int p1 = (bs.get_mantissa() >> (FPB::FRACTION_LEN - LOG_P1_BITS)) & - (LOG_P1_SIZE - 1); - - bs.set_uintval(bs.uintval() & (FPB::FRACTION_MASK >> LOG_P1_BITS)); - bs.set_biased_exponent(FPB::EXP_BIAS); - double dx = (bs.get_val() - 1.0) * LOG_P1_1_OVER[p1]; - - // Taylor series for log(2,1+x) - double c1 = fputil::multiply_add(dx, K_LOG2_ODD[0], K_LOG2_EVEN[0]); - double c2 = fputil::multiply_add(dx, K_LOG2_ODD[1], K_LOG2_EVEN[1]); - double c3 = fputil::multiply_add(dx, K_LOG2_ODD[2], K_LOG2_EVEN[2]); - double c4 = fputil::multiply_add(dx, K_LOG2_ODD[3], K_LOG2_EVEN[3]); - - // c0 = dx * (1.0 / ln(2)) + LOG_P1_LOG2[p1] - double c0 = fputil::multiply_add(dx, 0x1.71547652b82fep+0, LOG_P1_LOG2[p1]); - result += LIBC_NAMESPACE::fputil::polyeval(dx * dx, c0, c1, c2, c3, c4); - return result; -} - // x should be positive, normal finite value // TODO: Simplify range reduction and polynomial degree for float16. // See issue #137190. @@ -339,41 +164,6 @@ LIBC_INLINE static float log_eval_f(float x) { return result; } -// x should be positive, normal finite value -LIBC_INLINE static double log_eval(double x) { - // For x = 2^ex * (1 + mx) - // log(x) = ex * log(2) + log(1 + mx) - using FPB = fputil::FPBits; - FPB bs(x); - - double ex = static_cast(bs.get_exponent()); - - // p1 is the leading 7 bits of mx, i.e. - // p1 * 2^(-7) <= m_x < (p1 + 1) * 2^(-7). - int p1 = static_cast(bs.get_mantissa() >> (FPB::FRACTION_LEN - 7)); - - // Set bs to (1 + (mx - p1*2^(-7)) - bs.set_uintval(bs.uintval() & (FPB::FRACTION_MASK >> 7)); - bs.set_biased_exponent(FPB::EXP_BIAS); - // dx = (mx - p1*2^(-7)) / (1 + p1*2^(-7)). 
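// [Editorial worked example, not part of the patch: for x = 10.0 = 2^3 * 1.25
//  we get ex = 3 and the top 7 mantissa bits give p1 = 32, so
//  F = 1 + 32/128 = 1.25 and dx = (1.25 - 1.25) / 1.25 = 0; the result then
//  reduces to 3 * log(2) + LOG_F[32] = 2.0794 + 0.2231 ~ 2.3026 = log(10).]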
- double dx = (bs.get_val() - 1.0) * ONE_OVER_F[p1]; - - // Minimax polynomial of log(1 + dx) generated by Sollya with: - // > P = fpminimax(log(1 + x)/x, 6, [|D...|], [0, 2^-7]); - const double COEFFS[6] = {-0x1.ffffffffffffcp-2, 0x1.5555555552ddep-2, - -0x1.ffffffefe562dp-3, 0x1.9999817d3a50fp-3, - -0x1.554317b3f67a5p-3, 0x1.1dc5c45e09c18p-3}; - double dx2 = dx * dx; - double c1 = fputil::multiply_add(dx, COEFFS[1], COEFFS[0]); - double c2 = fputil::multiply_add(dx, COEFFS[3], COEFFS[2]); - double c3 = fputil::multiply_add(dx, COEFFS[5], COEFFS[4]); - - double p = fputil::polyeval(dx2, dx, c1, c2, c3); - double result = - fputil::multiply_add(ex, /*log(2)*/ 0x1.62e42fefa39efp-1, LOG_F[p1] + p); - return result; -} - } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_EXPLOGXF_H diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h index 05ac95d586823..b17b14fa2d756 100644 --- a/libc/src/math/generic/expxf16.h +++ b/libc/src/math/generic/expxf16.h @@ -17,18 +17,11 @@ #include "src/__support/macros/config.h" #include +#include "src/__support/math/exp10_float16_constants.h" #include "src/__support/math/expf16_utils.h" namespace LIBC_NAMESPACE_DECL { -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > for i from 0 to 7 do printsingle(round(2^(i * 2^-3), SG, RN)); -constexpr cpp::array EXP2_MID_BITS = { - 0x3f80'0000U, 0x3f8b'95c2U, 0x3f98'37f0U, 0x3fa5'fed7U, - 0x3fb5'04f3U, 0x3fc5'672aU, 0x3fd7'44fdU, 0x3fea'c0c7U, -}; - LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) { // For -25 < x < 16, to compute 2^x, we perform the following range reduction: // find hi, mid, lo, such that: @@ -66,53 +59,6 @@ LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) { return {exp2_hi_mid, exp2_lo}; } -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > round(log2(10), SG, RN); -static constexpr float LOG2F_10 = 0x1.a934fp+1f; - -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > round(log10(2), SG, RN); -static constexpr float LOG10F_2 = 0x1.344136p-2f; - -LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) { - // For -8 < x < 5, to compute 10^x, we perform the following range reduction: - // find hi, mid, lo, such that: - // x = (hi + mid) * log2(10) + lo, in which - // hi is an integer, - // mid * 2^3 is an integer, - // -2^(-4) <= lo < 2^(-4). - // In particular, - // hi + mid = round(x * 2^3) * 2^(-3). - // Then, - // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo - // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid - // by adding hi to the exponent field of 2^mid. 10^lo is computed using a - // degree-4 minimax polynomial generated by Sollya. 
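// [Editorial worked example, not part of the patch: for x = 1.0f16,
//  kf = round(1 * log2(10) * 2^3) = round(26.575) = 27, so x_hi = 27 >> 3 = 3,
//  x_mid = 27 & 7 = 3, and lo = 1 - 27 * log10(2) * 2^-3 ~ -0.016; 10^1 is
//  then rebuilt as 2^3 * 2^(3/8) * 10^lo ~ 10.375 * 0.964 ~ 10.0.]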
- - float xf = x; - float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f)); - int x_hi_mid = static_cast(kf); - unsigned x_hi = static_cast(x_hi_mid) >> 3; - unsigned x_mid = static_cast(x_hi_mid) & 0x7; - // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x - float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf); - - uint32_t exp2_hi_mid_bits = - EXP2_MID_BITS[x_mid] + - static_cast(x_hi << fputil::FPBits::FRACTION_LEN); - float exp2_hi_mid = fputil::FPBits(exp2_hi_mid_bits).get_val(); - // Degree-4 minimax polynomial generated by Sollya with the following - // commands: - // > display = hexadecimal; - // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]); - // > 1 + x * P; - float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f, - 0x1.04b434p+1f, 0x1.2bcf9ep+0f); - return {exp2_hi_mid, exp10_lo}; -} - // Generated by Sollya with the following commands: // > display = hexadecimal; // > round(log2(exp(1)), SG, RN); diff --git a/libc/src/math/generic/inv_trigf_utils.h b/libc/src/math/generic/inv_trigf_utils.h deleted file mode 100644 index 8b47aba342995..0000000000000 --- a/libc/src/math/generic/inv_trigf_utils.h +++ /dev/null @@ -1,110 +0,0 @@ -//===-- Single-precision general inverse trigonometric functions ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_MATH_GENERIC_INV_TRIGF_UTILS_H -#define LLVM_LIBC_SRC_MATH_GENERIC_INV_TRIGF_UTILS_H - -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -// PI and PI / 2 -static constexpr double M_MATH_PI = 0x1.921fb54442d18p+1; -static constexpr double M_MATH_PI_2 = 0x1.921fb54442d18p+0; - -extern double ATAN_COEFFS[17][9]; - -// Look-up table for atan(k/16) with k = 0..16. -static constexpr double ATAN_K_OVER_16[17] = { - 0.0, - 0x1.ff55bb72cfdeap-5, - 0x1.fd5ba9aac2f6ep-4, - 0x1.7b97b4bce5b02p-3, - 0x1.f5b75f92c80ddp-3, - 0x1.362773707ebccp-2, - 0x1.6f61941e4def1p-2, - 0x1.a64eec3cc23fdp-2, - 0x1.dac670561bb4fp-2, - 0x1.0657e94db30dp-1, - 0x1.1e00babdefeb4p-1, - 0x1.345f01cce37bbp-1, - 0x1.4978fa3269ee1p-1, - 0x1.5d58987169b18p-1, - 0x1.700a7c5784634p-1, - 0x1.819d0b7158a4dp-1, - 0x1.921fb54442d18p-1, -}; - -// For |x| <= 1/32 and 0 <= i <= 16, return Q(x) such that: -// Q(x) ~ (atan(x + i/16) - atan(i/16)) / x. -LIBC_INLINE static double atan_eval(double x, unsigned i) { - double x2 = x * x; - - double c0 = fputil::multiply_add(x, ATAN_COEFFS[i][2], ATAN_COEFFS[i][1]); - double c1 = fputil::multiply_add(x, ATAN_COEFFS[i][4], ATAN_COEFFS[i][3]); - double c2 = fputil::multiply_add(x, ATAN_COEFFS[i][6], ATAN_COEFFS[i][5]); - double c3 = fputil::multiply_add(x, ATAN_COEFFS[i][8], ATAN_COEFFS[i][7]); - - double x4 = x2 * x2; - double d1 = fputil::multiply_add(x2, c1, c0); - double d2 = fputil::multiply_add(x2, c3, c2); - double p = fputil::multiply_add(x4, d2, d1); - return p; -} - -// Evaluate atan without big lookup table. 
-// atan(n/d) - atan(k/16) = atan((n/d - k/16) / (1 + (n/d) * (k/16))) -// = atan((n - d * k/16)) / (d + n * k/16)) -// So we let q = (n - d * k/16) / (d + n * k/16), -// and approximate with Taylor polynomial: -// atan(q) ~ q - q^3/3 + q^5/5 - q^7/7 + q^9/9 -LIBC_INLINE static double atan_eval_no_table(double num, double den, - double k_over_16) { - double num_r = fputil::multiply_add(den, -k_over_16, num); - double den_r = fputil::multiply_add(num, k_over_16, den); - double q = num_r / den_r; - - constexpr double ATAN_TAYLOR[] = { - -0x1.5555555555555p-2, - 0x1.999999999999ap-3, - -0x1.2492492492492p-3, - 0x1.c71c71c71c71cp-4, - }; - double q2 = q * q; - double q3 = q2 * q; - double q4 = q2 * q2; - double c0 = fputil::multiply_add(q2, ATAN_TAYLOR[1], ATAN_TAYLOR[0]); - double c1 = fputil::multiply_add(q2, ATAN_TAYLOR[3], ATAN_TAYLOR[2]); - double d = fputil::multiply_add(q4, c1, c0); - return fputil::multiply_add(q3, d, q); -} - -// > Q = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20|], -// [|1, D...|], [0, 0.5]); -static constexpr double ASIN_COEFFS[10] = { - 0x1.5555555540fa1p-3, 0x1.333333512edc2p-4, 0x1.6db6cc1541b31p-5, - 0x1.f1caff324770ep-6, 0x1.6e43899f5f4f4p-6, 0x1.1f847cf652577p-6, - 0x1.9b60f47f87146p-7, 0x1.259e2634c494fp-6, -0x1.df946fa875ddp-8, - 0x1.02311ecf99c28p-5}; - -// Evaluate P(x^2) - 1, where P(x^2) ~ asin(x)/x -LIBC_INLINE static double asin_eval(double xsq) { - double x4 = xsq * xsq; - double r1 = fputil::polyeval(x4, ASIN_COEFFS[0], ASIN_COEFFS[2], - ASIN_COEFFS[4], ASIN_COEFFS[6], ASIN_COEFFS[8]); - double r2 = fputil::polyeval(x4, ASIN_COEFFS[1], ASIN_COEFFS[3], - ASIN_COEFFS[5], ASIN_COEFFS[7], ASIN_COEFFS[9]); - return fputil::multiply_add(xsq, r2, r1); -} - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_MATH_GENERIC_INV_TRIGF_UTILS_H diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index 7f614293029de..16b1b34a6c944 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -37,6 +37,7 @@ namespace internal { // We don't need to treat denormal and 0 LIBC_INLINE float log(double x) { + using namespace acoshf_internal; constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits; diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index dfdfd5d6d5760..a45ef511c9bad 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -9,20 +9,17 @@ #include "src/math/powf.h" #include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. 
#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/optional.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/except_value_utils.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/FPUtil/sqrt.h" // Speedup for powf(x, 1/2) = sqrtf(x) #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/exp10f.h" // Speedup for powf(10, y) = exp10f(y) -#include "exp10f_impl.h" // Speedup for powf(10, y) = exp10f(y) #include "exp2f_impl.h" // Speedup for powf(2, y) = exp2f(y) namespace LIBC_NAMESPACE_DECL { @@ -781,7 +778,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { return generic::exp2f(y); case 0x4120'0000: // x = 10.0f // pow(10, y) = exp10(y) - return generic::exp10f(y); + return math::exp10f(y); #endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS } diff --git a/libc/src/math/generic/sinhf.cpp b/libc/src/math/generic/sinhf.cpp index d6158fd302536..63111f84de141 100644 --- a/libc/src/math/generic/sinhf.cpp +++ b/libc/src/math/generic/sinhf.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/math/sinhf.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/macros/config.h" diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt index 633d9f12949d2..8ab1c9ff98d2f 100644 --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -9,6 +9,7 @@ add_header_library( aarch64/inline_memset.h arm/common.h arm/inline_memcpy.h + arm/inline_memset.h generic/aligned_access.h generic/byte_per_byte.h inline_bcmp.h diff --git a/libc/src/string/memory_utils/arm/common.h b/libc/src/string/memory_utils/arm/common.h index 155bc3481709e..b9f40b64fed98 100644 --- a/libc/src/string/memory_utils/arm/common.h +++ b/libc/src/string/memory_utils/arm/common.h @@ -14,6 +14,9 @@ #include // size_t +// Our minimum supported compiler version does not recognize the standard +// [[likely]] / [[unlikely]] attributes so we use the preprocessor. 
+ // https://libc.llvm.org/compiler_support.html // Support for [[likely]] / [[unlikely]] // [X] GCC 12.2 diff --git a/libc/src/string/memory_utils/arm/inline_memcpy.h b/libc/src/string/memory_utils/arm/inline_memcpy.h index 30b99d41e0967..c748048a3e586 100644 --- a/libc/src/string/memory_utils/arm/inline_memcpy.h +++ b/libc/src/string/memory_utils/arm/inline_memcpy.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H +#include "src/__support/CPP/type_traits.h" // always_false #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL #include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY @@ -36,7 +37,7 @@ LIBC_INLINE void copy(void *dst, const void *src) { } else if constexpr (access == AssumeAccess::kUnknown) { memcpy_inline(dst, src); } else { - static_assert(false); + static_assert(cpp::always_false, "Invalid AssumeAccess"); } } @@ -54,7 +55,7 @@ LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) { copy(dst + offset, src + offset); } } else { - static_assert(false, "Invalid BlockOp"); + static_assert(cpp::always_false, "Invalid BlockOp"); } // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting // into the load/store instructions. @@ -102,6 +103,7 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { copy_bytes_and_bump_pointers(dst, src, offset); size -= offset; } + constexpr AssumeAccess kAligned = AssumeAccess::kAligned; const auto src_alignment = distance_to_align_down(src); if (src_alignment == 0) LIBC_ATTR_LIKELY { @@ -110,14 +112,11 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { // load/store multiple (LDM, STM), each of 4 words. This requires more // registers so additional push/pop are needed but the speedup is worth // it. - consume_by_block<64, BlockOp::kFull, AssumeAccess::kAligned>(dst, src, - size); + consume_by_block<64, BlockOp::kFull, kAligned>(dst, src, size); // Then we use blocks of 4 word load/store. - consume_by_block<16, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src, - size); + consume_by_block<16, BlockOp::kByWord, kAligned>(dst, src, size); // Then we use word by word copy. - consume_by_block<4, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src, - size); + consume_by_block<4, BlockOp::kByWord, kAligned>(dst, src, size); } else { // `dst` is aligned but `src` is not. @@ -128,7 +127,7 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { src_alignment == 2 ? load_aligned(src) : load_aligned(src); - copy(dst, &value); + copy(dst, &value); dst += kWordSize; src += kWordSize; size -= kWordSize; @@ -173,7 +172,7 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { // accesses through the use of load/store multiple (LDM, STM) and load/store // double (LDRD, STRD) instructions are generally not supported and can fault. // By forcing decomposition of 64 bytes copy into word by word copy, the - // compiler can use the first load to prefetch memory: + // compiler uses a load to prefetch the next cache line: // ldr r3, [r1, #64]! <- prefetch next cache line // str r3, [r0] // ldr r3, [r1, #0x4] @@ -183,11 +182,10 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { // str r3, [r0, #0x3c] // This is a bit detrimental for sizes between 64 and 256 (less than 10% // penalty) but the prefetch yields better throughput for larger copies. 
- consume_by_block<64, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, - size); - consume_by_block<16, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, - size); - consume_by_block<4, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, size); + constexpr AssumeAccess kUnknown = AssumeAccess::kUnknown; + consume_by_block<64, BlockOp::kByWord, kUnknown>(dst, src, size); + consume_by_block<16, BlockOp::kByWord, kUnknown>(dst, src, size); + consume_by_block<4, BlockOp::kByWord, kUnknown>(dst, src, size); if (size & 1) copy_block_and_bump_pointers<1>(dst, src); if (size & 2) diff --git a/libc/src/string/memory_utils/arm/inline_memset.h b/libc/src/string/memory_utils/arm/inline_memset.h new file mode 100644 index 0000000000000..a7ef9cc7df916 --- /dev/null +++ b/libc/src/string/memory_utils/arm/inline_memset.h @@ -0,0 +1,156 @@ +//===-- Memset implementation for arm ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// The functions defined in this file give approximate code size. These sizes +// assume the following configuration options: +// - LIBC_CONF_KEEP_FRAME_POINTER = false +// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false +// - LIBC_ADD_NULL_CHECKS = false +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMSET_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMSET_H + +#include "src/__support/CPP/type_traits.h" // always_false +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL +#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY +#include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align + +#include // size_t + +namespace LIBC_NAMESPACE_DECL { + +namespace { + +template +LIBC_INLINE void set(void *dst, uint32_t value) { + static_assert(bytes == 1 || bytes == 2 || bytes == 4); + if constexpr (access == AssumeAccess::kAligned) { + constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes; + memcpy_inline(assume_aligned(dst), &value); + } else if constexpr (access == AssumeAccess::kUnknown) { + memcpy_inline(dst, &value); + } else { + static_assert(cpp::always_false, "Invalid AssumeAccess"); + } +} + +template +LIBC_INLINE void set_block_and_bump_pointers(Ptr &dst, uint32_t value) { + if constexpr (bytes <= kWordSize) { + set(dst, value); + } else { + static_assert(bytes % kWordSize == 0 && bytes >= kWordSize); + LIBC_LOOP_UNROLL + for (size_t offset = 0; offset < bytes; offset += kWordSize) { + set(dst + offset, value); + } + } + // In the 1, 2, 4 byte set case, the compiler can fold pointer offsetting + // into the store instructions. + // e.g., + // strb r3, [r0], #1 + dst += bytes; +} + +template +LIBC_INLINE void consume_by_block(Ptr &dst, uint32_t value, size_t &size) { + LIBC_LOOP_NOUNROLL + for (size_t i = 0; i < size / bytes; ++i) + set_block_and_bump_pointers(dst, value); + size %= bytes; +} + +[[maybe_unused]] LIBC_INLINE void +set_bytes_and_bump_pointers(Ptr &dst, uint32_t value, size_t size) { + LIBC_LOOP_NOUNROLL + for (size_t i = 0; i < size; ++i) { + set<1, AssumeAccess::kUnknown>(dst++, value); + } +} + +} // namespace + +// Implementation for Cortex-M0, M0+, M1. 
It compiles down to 140 bytes when +// used through `memset` that also needs to return the `dst` ptr. These cores do +// not allow unaligned stores so all accesses are aligned. +[[maybe_unused]] LIBC_INLINE void +inline_memset_arm_low_end(Ptr dst, uint8_t value, size_t size) { + if (size >= 8) + LIBC_ATTR_LIKELY { + // Align `dst` to word boundary. + if (const size_t offset = distance_to_align_up(dst)) + LIBC_ATTR_UNLIKELY { + set_bytes_and_bump_pointers(dst, value, offset); + size -= offset; + } + const uint32_t value32 = value * 0x01010101U; // splat value in each byte + consume_by_block<64, AssumeAccess::kAligned>(dst, value32, size); + consume_by_block<16, AssumeAccess::kAligned>(dst, value32, size); + consume_by_block<4, AssumeAccess::kAligned>(dst, value32, size); + } + set_bytes_and_bump_pointers(dst, value, size); +} + +// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware +// support for unaligned loads and stores. It compiles down to 186 bytes when +// used through `memset` that also needs to return the `dst` ptr. +[[maybe_unused]] LIBC_INLINE void +inline_memset_arm_mid_end(Ptr dst, uint8_t value, size_t size) { + const uint32_t value32 = value * 0x01010101U; // splat value in each byte + if (misaligned(dst)) + LIBC_ATTR_UNLIKELY { + if (size < 8) + LIBC_ATTR_UNLIKELY { + if (size & 1) + set_block_and_bump_pointers<1>(dst, value32); + if (size & 2) + set_block_and_bump_pointers<2>(dst, value32); + if (size & 4) + set_block_and_bump_pointers<4>(dst, value32); + return; + } + const size_t offset = distance_to_align_up(dst); + if (offset & 1) + set_block_and_bump_pointers<1>(dst, value32); + if (offset & 2) + set_block_and_bump_pointers<2>(dst, value32); + size -= offset; + } + // If we tell the compiler that the stores are aligned it will generate 8 x + // STRD instructions. By not specifying alignment, the compiler conservatively + // uses 16 x STR.W and is able to use the first one to prefetch the + // destination in advance leading to better asymptotic performances. + // str r12, [r3, #64]! <- prefetch next cache line + // str.w r12, [r3, #0x4] + // str.w r12, [r3, #0x8] + // ... + // str.w r12, [r3, #0x38] + // str.w r12, [r3, #0x3c] + consume_by_block<64, AssumeAccess::kUnknown>(dst, value32, size); + // Prefetching does not matter anymore at this scale so using STRD yields + // better results. 
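// Illustrative aside (not part of the patch): both memset variants above rely
// on `value * 0x01010101U` to replicate the byte into every lane of a 32-bit
// word, which is what lets each STR write four copies (and each STRD eight) at
// a time. A minimal, self-contained check of that identity:
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t value = 0xAB;
  const uint32_t value32 = value * 0x01010101U; // splat value in each byte
  assert(value32 == 0xABABABABu);
  return 0;
}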
+ consume_by_block<16, AssumeAccess::kAligned>(dst, value32, size); + consume_by_block<4, AssumeAccess::kAligned>(dst, value32, size); + if (size & 1) + set_block_and_bump_pointers<1>(dst, value32); + if (size & 2) + LIBC_ATTR_UNLIKELY + set_block_and_bump_pointers<2>(dst, value32); +} + +[[maybe_unused]] LIBC_INLINE void +inline_memset_arm_dispatch(Ptr dst, uint8_t value, size_t size) { +#ifdef __ARM_FEATURE_UNALIGNED + return inline_memset_arm_mid_end(dst, value, size); +#else + return inline_memset_arm_low_end(dst, value, size); +#endif +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H diff --git a/libc/src/string/memory_utils/inline_memset.h b/libc/src/string/memory_utils/inline_memset.h index fd9c29ea4410a..e41bdb626d60e 100644 --- a/libc/src/string/memory_utils/inline_memset.h +++ b/libc/src/string/memory_utils/inline_memset.h @@ -18,6 +18,9 @@ #if defined(LIBC_TARGET_ARCH_IS_X86) #include "src/string/memory_utils/x86_64/inline_memset.h" #define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_x86 +#elif defined(LIBC_TARGET_ARCH_IS_ARM) +#include "src/string/memory_utils/arm/inline_memset.h" +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_arm_dispatch #elif defined(LIBC_TARGET_ARCH_IS_AARCH64) #include "src/string/memory_utils/aarch64/inline_memset.h" #define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_aarch64_dispatch @@ -34,7 +37,8 @@ namespace LIBC_NAMESPACE_DECL { -LIBC_INLINE static void inline_memset(void *dst, uint8_t value, size_t count) { +[[gnu::flatten]] LIBC_INLINE void inline_memset(void *dst, uint8_t value, + size_t count) { LIBC_SRC_STRING_MEMORY_UTILS_MEMSET(reinterpret_cast(dst), value, count); } diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h index 4f56263fce8ec..1231117586a7c 100644 --- a/libc/src/string/string_utils.h +++ b/libc/src/string/string_utils.h @@ -21,6 +21,8 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include // uintptr_t + namespace LIBC_NAMESPACE_DECL { namespace internal { diff --git a/libc/src/sys/epoll/linux/epoll_create.cpp b/libc/src/sys/epoll/linux/epoll_create.cpp index 2e44e883ddf0a..dcd082b56f9ad 100644 --- a/libc/src/sys/epoll/linux/epoll_create.cpp +++ b/libc/src/sys/epoll/linux/epoll_create.cpp @@ -20,6 +20,11 @@ LLVM_LIBC_FUNCTION(int, epoll_create, ([[maybe_unused]] int size)) { #ifdef SYS_epoll_create int ret = LIBC_NAMESPACE::syscall_impl(SYS_epoll_create, size); #elif defined(SYS_epoll_create1) + if (size == 0) { + libc_errno = EINVAL; + return -1; + } + int ret = LIBC_NAMESPACE::syscall_impl(SYS_epoll_create1, 0); #else #error \ diff --git a/libc/src/sys/time/linux/setitimer.cpp b/libc/src/sys/time/linux/setitimer.cpp index 1de0d43297760..fb163586e30d9 100644 --- a/libc/src/sys/time/linux/setitimer.cpp +++ b/libc/src/sys/time/linux/setitimer.cpp @@ -22,9 +22,9 @@ LLVM_LIBC_FUNCTION(int, setitimer, // There is no SYS_setitimer_time64 call, so we can't use time_t directly, // and need to convert it to long first. 
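// Illustrative sketch (not part of the patch): the shape of the packing the
// comment above describes, written with assumed stand-in types rather than the
// real libc/kernel definitions. With 64-bit tv_sec/tv_usec fields and a 32-bit
// `long`, every field needs the explicit narrowing cast the hunk below adds.
#include <cstdint>

struct sketch_timeval {
  int64_t tv_sec;
  int64_t tv_usec;
};
struct sketch_itimerval {
  sketch_timeval it_interval;
  sketch_timeval it_value;
};

// Pack the four fields into the native-long layout the legacy syscall expects.
inline void pack_for_setitimer(const sketch_itimerval &in, long out[4]) {
  out[0] = static_cast<long>(in.it_interval.tv_sec);
  out[1] = static_cast<long>(in.it_interval.tv_usec);
  out[2] = static_cast<long>(in.it_value.tv_sec);
  out[3] = static_cast<long>(in.it_value.tv_usec);
}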
long new_value32[4] = {static_cast(new_value->it_interval.tv_sec), - new_value->it_interval.tv_usec, + static_cast(new_value->it_interval.tv_usec), static_cast(new_value->it_value.tv_sec), - new_value->it_value.tv_usec}; + static_cast(new_value->it_value.tv_usec)}; long old_value32[4]; ret = LIBC_NAMESPACE::syscall_impl(SYS_setitimer, which, new_value32, diff --git a/libc/src/sys/time/linux/utimes.cpp b/libc/src/sys/time/linux/utimes.cpp index ed37b42aedf6c..9c00ce9909f2f 100644 --- a/libc/src/sys/time/linux/utimes.cpp +++ b/libc/src/sys/time/linux/utimes.cpp @@ -59,8 +59,10 @@ LLVM_LIBC_FUNCTION(int, utimes, ts[1].tv_sec = times[1].tv_sec; // convert u-seconds to nanoseconds - ts[0].tv_nsec = times[0].tv_usec * 1000; - ts[1].tv_nsec = times[1].tv_usec * 1000; + ts[0].tv_nsec = + static_cast(times[0].tv_usec * 1000); + ts[1].tv_nsec = + static_cast(times[1].tv_usec * 1000); ts_ptr = ts; } diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 7ace1a6ca66ba..159778df6acca 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -1,3 +1,13 @@ +add_header_library( + wchar_utils + HDRS + wchar_utils.h + DEPENDS + libc.hdr.types.size_t + libc.hdr.types.wchar_t + libc.src.__support.common +) + add_entrypoint_object( wcslen SRCS @@ -255,6 +265,8 @@ add_entrypoint_object( DEPENDS libc.hdr.wchar_macros libc.hdr.types.size_t + libc.src.wchar.wchar_utils + libc.src.__support.macros.null_check ) add_entrypoint_object( @@ -266,6 +278,8 @@ add_entrypoint_object( DEPENDS libc.hdr.wchar_macros libc.hdr.types.size_t + libc.src.wchar.wchar_utils + libc.src.__support.macros.null_check ) add_entrypoint_object( diff --git a/libc/src/wchar/mbtowc.cpp b/libc/src/wchar/mbtowc.cpp index eae39ba6081f3..6d099d43da5fa 100644 --- a/libc/src/wchar/mbtowc.cpp +++ b/libc/src/wchar/mbtowc.cpp @@ -25,10 +25,7 @@ LLVM_LIBC_FUNCTION(int, mbtowc, if (s == nullptr) return 0; internal::mbstate internal_mbstate; - // temp ptr to use if pwc is nullptr - wchar_t buf[1]; - auto ret = - internal::mbrtowc(pwc == nullptr ? buf : pwc, s, n, &internal_mbstate); + auto ret = internal::mbrtowc(pwc, s, n, &internal_mbstate); if (!ret.has_value() || static_cast(ret.value()) == -2) { // Encoding failure libc_errno = EILSEQ; diff --git a/libc/src/wchar/wchar_utils.h b/libc/src/wchar/wchar_utils.h new file mode 100644 index 0000000000000..e0218c7d89b1f --- /dev/null +++ b/libc/src/wchar/wchar_utils.h @@ -0,0 +1,45 @@ +//===-- wchar utils ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_WCHAR_UTILS_H +#define LLVM_LIBC_SRC_WCHAR_WCHAR_UTILS_H + +#include "hdr/types/size_t.h" +#include "hdr/types/wchar_t.h" +#include "src/__support/common.h" +#include "src/__support/macros/attributes.h" // LIBC_INLINE + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +// returns true if the character exists in the string +LIBC_INLINE static bool wcschr(wchar_t c, const wchar_t *str) { + for (int n = 0; str[n]; ++n) { + if (str[n] == c) + return true; + } + return false; +} + +// bool should be true for wcscspn for complimentary span +// should be false for wcsspn since we want it to span +LIBC_INLINE static size_t wcsspn(const wchar_t *s1, const wchar_t *s2, + bool not_match_set) { + size_t i = 0; + for (; s1[i]; ++i) { + bool in_set = wcschr(s1[i], s2); + if (in_set == not_match_set) + return i; + } + return i; +} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_WCHAR_UTILS_H diff --git a/libc/src/wchar/wcscspn.cpp b/libc/src/wchar/wcscspn.cpp index 8869d84cdfdee..34f3451b19c30 100644 --- a/libc/src/wchar/wcscspn.cpp +++ b/libc/src/wchar/wcscspn.cpp @@ -12,23 +12,15 @@ #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/null_check.h" +#include "wchar_utils.h" namespace LIBC_NAMESPACE_DECL { -bool check(wchar_t c, const wchar_t *s2) { - for (int n = 0; s2[n]; ++n) { - if (s2[n] == c) - return false; - } - return true; -} LLVM_LIBC_FUNCTION(size_t, wcscspn, (const wchar_t *s1, const wchar_t *s2)) { - size_t i = 0; - for (; s1[i]; ++i) { - if (!check(s1[i], s2)) - return i; - } - return i; + LIBC_CRASH_ON_NULLPTR(s1); + LIBC_CRASH_ON_NULLPTR(s2); + return internal::wcsspn(s1, s2, /*not_match_set=*/true); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/wcsspn.cpp b/libc/src/wchar/wcsspn.cpp index 23de381a2d954..ae2cf5a41ae41 100644 --- a/libc/src/wchar/wcsspn.cpp +++ b/libc/src/wchar/wcsspn.cpp @@ -12,23 +12,15 @@ #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/null_check.h" +#include "wchar_utils.h" namespace LIBC_NAMESPACE_DECL { -bool check(wchar_t c, const wchar_t *s2) { - for (int n = 0; s2[n]; ++n) { - if (s2[n] == c) - return true; - } - return false; -} LLVM_LIBC_FUNCTION(size_t, wcsspn, (const wchar_t *s1, const wchar_t *s2)) { - size_t i = 0; - for (; s1[i]; ++i) { - if (!check(s1[i], s2)) - return i; - } - return i; + LIBC_CRASH_ON_NULLPTR(s1); + LIBC_CRASH_ON_NULLPTR(s2); + return internal::wcsspn(s1, s2, /*not_match_set=*/false); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wctype/CMakeLists.txt b/libc/src/wctype/CMakeLists.txt new file mode 100644 index 0000000000000..3ac5eaef8ed8b --- /dev/null +++ b/libc/src/wctype/CMakeLists.txt @@ -0,0 +1,9 @@ +add_entrypoint_object( + iswalpha + SRCS + iswalpha.cpp + HDRS + iswalpha.h + DEPENDS + libc.src.__support.wctype_utils +) diff --git a/libc/src/wctype/iswalpha.cpp b/libc/src/wctype/iswalpha.cpp new file mode 100644 index 0000000000000..e18f29370fbd0 --- /dev/null +++ b/libc/src/wctype/iswalpha.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of iswalpha ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wctype/iswalpha.h" +#include "src/__support/common.h" +#include "src/__support/wctype_utils.h" + +#include "hdr/types/wint_t.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(bool, iswalpha, (wint_t c)) { return internal::iswalpha(c); } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wctype/iswalpha.h b/libc/src/wctype/iswalpha.h new file mode 100644 index 0000000000000..681fc6ba79a54 --- /dev/null +++ b/libc/src/wctype/iswalpha.h @@ -0,0 +1,21 @@ +//===-- Implementation header for iswalpha ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCTYPE_ISWALPHA_H +#define LLVM_LIBC_SRC_WCTYPE_ISWALPHA_H + +#include "hdr/types/wint_t.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE_DECL { + +bool iswalpha(wint_t c); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCTYPE_ISWALPHA_H diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt index 1a0780faff512..011ad6aeb34b7 100644 --- a/libc/test/CMakeLists.txt +++ b/libc/test/CMakeLists.txt @@ -20,6 +20,7 @@ endif() add_subdirectory(src) add_subdirectory(utils) +add_subdirectory(shared) if(NOT LLVM_LIBC_FULL_BUILD) return() diff --git a/libc/test/UnitTest/Test.h b/libc/test/UnitTest/Test.h index a5a2a3c7cf58e..e70fc51869624 100644 --- a/libc/test/UnitTest/Test.h +++ b/libc/test/UnitTest/Test.h @@ -52,4 +52,13 @@ libc_errno = 0; \ } while (0) +// Some macro utility to append file names with LIBC_TEST macro's value to be +// used in stdio tests. +#undef STR +#undef EVAL_THEN_STR +#define STR(X) #X +#define EVAL_THEN_STR(X) STR(X) + +#define APPEND_LIBC_TEST(X) X "." EVAL_THEN_STR(LIBC_TEST) + #endif // LLVM_LIBC_TEST_UNITTEST_TEST_H diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt index 24935cec048ba..11e4c3a84157f 100644 --- a/libc/test/include/CMakeLists.txt +++ b/libc/test/include/CMakeLists.txt @@ -484,6 +484,21 @@ add_libc_test( libc.include.llvm-libc-macros.math_function_macros ) +add_libc_test( + math_constants_c_test + C_TEST + UNIT_TEST_ONLY + SUITE + libc_include_tests + SRCS + math_constants_test.c + COMPILE_OPTIONS + -Wall + -Werror + DEPENDS + libc.include.llvm-libc-macros.math_macros +) + # Test `#include <...>` of each header in each available language mode. # This is gated on -DLLVM_LIBC_BUILD_HEADER_TESTS=ON until all the bugs # in headers are fixed so the tests all compile. diff --git a/libc/test/include/math_constants_test.c b/libc/test/include/math_constants_test.c new file mode 100644 index 0000000000000..eb497a9d8a50a --- /dev/null +++ b/libc/test/include/math_constants_test.c @@ -0,0 +1,23 @@ +//===-- Unittests for math constants --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "include/llvm-libc-macros/math-macros.h" + +#define IS_DOUBLE(X) _Generic((X), double: 1, default: 0) + +#define IS_FLOAT(X) _Generic((X), float: 1, default: 0) + +// check if macro is defined +#ifndef M_PI +#error "M_PI macro is not defined" +#else +int main(void) { + _Static_assert(IS_DOUBLE(M_PI), "M_PI is not of double type."); + _Static_assert(IS_FLOAT(M_PIf), "M_PIf is not of float type."); + return 0; +} +#endif diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt new file mode 100644 index 0000000000000..89b607d7e5cc3 --- /dev/null +++ b/libc/test/shared/CMakeLists.txt @@ -0,0 +1,28 @@ +add_custom_target(libc-shared-tests) + +add_fp_unittest( + shared_math_test + SUITE + libc-shared-tests + SRCS + shared_math_test.cpp + DEPENDS + libc.src.__support.math.acos + libc.src.__support.math.acosf + libc.src.__support.math.acosf16 + libc.src.__support.math.acoshf + libc.src.__support.math.acoshf16 + libc.src.__support.math.erff + libc.src.__support.math.exp + libc.src.__support.math.exp10 + libc.src.__support.math.exp10f + libc.src.__support.math.exp10f16 + libc.src.__support.math.expf + libc.src.__support.math.expf16 + libc.src.__support.math.frexpf + libc.src.__support.math.frexpf128 + libc.src.__support.math.frexpf16 + libc.src.__support.math.ldexpf + libc.src.__support.math.ldexpf128 + libc.src.__support.math.ldexpf16 +) diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp new file mode 100644 index 0000000000000..8d3cebdf0745c --- /dev/null +++ b/libc/test/shared/shared_math_test.cpp @@ -0,0 +1,71 @@ +//===-- Unittests for shared math functions -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "shared/math.h" +#include "test/UnitTest/FPMatcher.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +TEST(LlvmLibcSharedMathTest, AllFloat16) { + int exponent; + + EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::acoshf16(1.0f)); + + EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp10f16(0.0f16)); + + EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::expf16(0.0f16)); + + ASSERT_FP_EQ(float16(8 << 5), LIBC_NAMESPACE::shared::ldexpf16(float(8), 5)); + ASSERT_FP_EQ(float16(-1 * (8 << 5)), + LIBC_NAMESPACE::shared::ldexpf16(float(-8), 5)); + + EXPECT_FP_EQ_ALL_ROUNDING(0.75f16, + LIBC_NAMESPACE::shared::frexpf16(24.0f, &exponent)); + EXPECT_EQ(exponent, 5); + + EXPECT_FP_EQ(0x1.921fb6p+0f16, LIBC_NAMESPACE::shared::acosf16(0.0f16)); +} + +#endif + +TEST(LlvmLibcSharedMathTest, AllFloat) { + int exponent; + + EXPECT_FP_EQ(0x1.921fb6p+0, LIBC_NAMESPACE::shared::acosf(0.0f)); + EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::exp10f(0.0f)); + EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::expf(0.0f)); + EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::erff(0.0f)); + EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::acoshf(1.0f)); + + EXPECT_FP_EQ_ALL_ROUNDING(0.75f, + LIBC_NAMESPACE::shared::frexpf(24.0f, &exponent)); + EXPECT_EQ(exponent, 5); + + ASSERT_FP_EQ(float(8 << 5), LIBC_NAMESPACE::shared::ldexpf(float(8), 5)); + ASSERT_FP_EQ(float(-1 * (8 << 5)), + LIBC_NAMESPACE::shared::ldexpf(float(-8), 5)); +} + +TEST(LlvmLibcSharedMathTest, AllDouble) { + EXPECT_FP_EQ(0x1.921fb54442d18p+0, LIBC_NAMESPACE::shared::acos(0.0)); + EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp(0.0)); + EXPECT_FP_EQ(0x1p+0, LIBC_NAMESPACE::shared::exp10(0.0)); +} + +TEST(LlvmLibcSharedMathTest, AllFloat128) { + int exponent; + + EXPECT_FP_EQ_ALL_ROUNDING( + float128(0.75), LIBC_NAMESPACE::shared::frexpf128(24.0f, &exponent)); + EXPECT_EQ(exponent, 5); + + ASSERT_FP_EQ(float128(8 << 5), + LIBC_NAMESPACE::shared::ldexpf128(float(8), 5)); + ASSERT_FP_EQ(float128(-1 * (8 << 5)), + LIBC_NAMESPACE::shared::ldexpf128(float(-8), 5)); +} diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index 6dca47b5343e6..b3eba43582074 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -70,6 +70,7 @@ add_subdirectory(stdlib) add_subdirectory(string) add_subdirectory(strings) add_subdirectory(wchar) +add_subdirectory(wctype) add_subdirectory(time) add_subdirectory(unistd) diff --git a/libc/test/src/__support/FPUtil/CMakeLists.txt b/libc/test/src/__support/FPUtil/CMakeLists.txt index dfd90057b6ebf..81db4ccae44c6 100644 --- a/libc/test/src/__support/FPUtil/CMakeLists.txt +++ b/libc/test/src/__support/FPUtil/CMakeLists.txt @@ -39,6 +39,12 @@ add_fp_unittest( libc.src.__support.FPUtil.rounding_mode ) +# TODO: Temporally disable bfloat16 test until MPCommon target is updated +# https://github.com/llvm/llvm-project/pull/149678 +if(LLVM_LIBC_FULL_BUILD) + return() +endif() + add_fp_unittest( bfloat16_test NEED_MPFR @@ -49,3 +55,15 @@ add_fp_unittest( DEPENDS libc.src.__support.FPUtil.bfloat16 ) + +add_fp_unittest( + comparison_operations_test + SUITE + libc-fputil-tests + SRCS + comparison_operations_test.cpp + DEPENDS + libc.src.__support.FPUtil.bfloat16 + libc.src.__support.FPUtil.comparison_operations + libc.src.__support.macros.properties.types +) diff --git a/libc/test/src/__support/FPUtil/comparison_operations_test.cpp 
b/libc/test/src/__support/FPUtil/comparison_operations_test.cpp new file mode 100644 index 0000000000000..04a3321fd5dbf --- /dev/null +++ b/libc/test/src/__support/FPUtil/comparison_operations_test.cpp @@ -0,0 +1,350 @@ +//===-- Unittests for comparison operations on floating-point numbers -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/FPUtil/bfloat16.h" +#include "src/__support/FPUtil/comparison_operations.h" +#include "src/__support/macros/properties/types.h" +#include "test/UnitTest/FEnvSafeTest.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LIBC_NAMESPACE::fputil::equals; +using LIBC_NAMESPACE::fputil::greater_than; +using LIBC_NAMESPACE::fputil::greater_than_or_equals; +using LIBC_NAMESPACE::fputil::less_than; +using LIBC_NAMESPACE::fputil::less_than_or_equals; + +using BFloat16 = LIBC_NAMESPACE::fputil::BFloat16; + +template +class ComparisonOperationsTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { + DECLARE_SPECIAL_CONSTANTS(T) + + // TODO: Make these constexpr once quick_get_round is made constexpr. + T normal1; + T neg_normal1; + T normal2; + T small; + T neg_small; + T large; + T neg_large; + +public: + void SetUp() override { + with_fenv_preserved([this]() { + normal1 = T(3.14); + neg_normal1 = T(-3.14); + normal2 = T(2.71); + small = T(0.1); + neg_small = T(-0.1); + large = T(10000.0); + neg_large = T(-10000.0); + }); + } + + void test_equals() { + EXPECT_TRUE(equals(neg_zero, neg_zero)); + EXPECT_TRUE(equals(zero, neg_zero)); + EXPECT_TRUE(equals(neg_zero, zero)); + + EXPECT_TRUE(equals(inf, inf)); + EXPECT_TRUE(equals(neg_inf, neg_inf)); + EXPECT_FALSE(equals(inf, neg_inf)); + EXPECT_FALSE(equals(neg_inf, inf)); + + EXPECT_TRUE(equals(normal1, normal1)); + EXPECT_TRUE(equals(normal2, normal2)); + EXPECT_FALSE(equals(normal1, normal2)); + EXPECT_FALSE(equals(normal1, neg_normal1)); + + auto test_qnan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(equals(x, y)); + EXPECT_FP_EXCEPTION(0); + }; + + test_qnan(aNaN, aNaN); + test_qnan(aNaN, neg_aNaN); + test_qnan(aNaN, zero); + test_qnan(aNaN, inf); + test_qnan(aNaN, normal1); + + test_qnan(neg_aNaN, neg_aNaN); + test_qnan(neg_aNaN, aNaN); + test_qnan(neg_aNaN, zero); + test_qnan(neg_aNaN, inf); + test_qnan(neg_aNaN, normal1); + + auto test_snan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(equals(x, y)); + EXPECT_FP_EXCEPTION(FE_INVALID); + }; + + test_snan(sNaN, sNaN); + test_snan(sNaN, neg_sNaN); + test_snan(sNaN, aNaN); + test_snan(sNaN, neg_aNaN); + test_snan(sNaN, zero); + test_snan(sNaN, neg_zero); + test_snan(sNaN, inf); + test_snan(sNaN, neg_inf); + test_snan(sNaN, normal1); + + test_snan(neg_sNaN, neg_sNaN); + test_snan(neg_sNaN, sNaN); + test_snan(neg_sNaN, aNaN); + test_snan(neg_sNaN, neg_aNaN); + test_snan(neg_sNaN, zero); + test_snan(neg_sNaN, neg_zero); + test_snan(neg_sNaN, inf); + test_snan(neg_sNaN, neg_inf); + test_snan(neg_sNaN, normal1); + } + + void test_less_than() { + EXPECT_TRUE(less_than(neg_small, small)); + EXPECT_TRUE(less_than(small, large)); + + EXPECT_TRUE(less_than(neg_large, neg_small)); + EXPECT_FALSE(less_than(large, small)); + EXPECT_FALSE(less_than(small, neg_small)); + + 
EXPECT_FALSE(less_than(zero, neg_zero)); + EXPECT_FALSE(less_than(neg_zero, zero)); + EXPECT_FALSE(less_than(zero, zero)); + + EXPECT_TRUE(less_than(neg_small, zero)); + EXPECT_TRUE(less_than(neg_zero, small)); + EXPECT_FALSE(less_than(small, zero)); + + EXPECT_TRUE(less_than(neg_inf, inf)); + EXPECT_TRUE(less_than(neg_inf, neg_small)); + EXPECT_TRUE(less_than(small, inf)); + EXPECT_FALSE(less_than(inf, small)); + + EXPECT_FALSE(less_than(small, small)); + EXPECT_FALSE(less_than(neg_inf, neg_inf)); + + auto test_qnan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(less_than(x, y)); + EXPECT_FP_EXCEPTION(FE_INVALID); + }; + + test_qnan(aNaN, small); + test_qnan(small, aNaN); + test_qnan(aNaN, aNaN); + test_qnan(neg_aNaN, neg_small); + test_qnan(neg_small, neg_aNaN); + test_qnan(neg_aNaN, neg_aNaN); + + auto test_snan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(less_than(x, y)); + EXPECT_FP_EXCEPTION(FE_INVALID); + }; + + test_snan(sNaN, small); + test_snan(sNaN, neg_small); + test_snan(sNaN, zero); + test_snan(sNaN, inf); + test_snan(sNaN, aNaN); + test_snan(sNaN, sNaN); + + test_snan(neg_sNaN, small); + test_snan(neg_sNaN, neg_small); + test_snan(neg_sNaN, zero); + test_snan(neg_sNaN, inf); + test_snan(neg_sNaN, aNaN); + test_snan(neg_sNaN, neg_sNaN); + } + + void test_greater_than() { + EXPECT_TRUE(greater_than(large, neg_small)); + EXPECT_TRUE(greater_than(neg_small, neg_large)); + + EXPECT_FALSE(greater_than(large, large)); + EXPECT_FALSE(greater_than(neg_small, large)); + + EXPECT_FALSE(greater_than(zero, neg_zero)); + EXPECT_FALSE(greater_than(neg_zero, zero)); + + EXPECT_TRUE(greater_than(inf, neg_inf)); + EXPECT_TRUE(greater_than(inf, large)); + EXPECT_TRUE(greater_than(large, neg_inf)); + EXPECT_FALSE(greater_than(neg_inf, inf)); + + EXPECT_FALSE(greater_than(large, large)); + EXPECT_FALSE(greater_than(inf, inf)); + + auto test_qnan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(greater_than(x, y)); + EXPECT_FP_EXCEPTION(FE_INVALID); + }; + + test_qnan(aNaN, large); + test_qnan(large, aNaN); + test_qnan(aNaN, aNaN); + test_qnan(neg_aNaN, neg_small); + test_qnan(neg_small, neg_aNaN); + test_qnan(neg_aNaN, neg_aNaN); + + auto test_snan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(greater_than(x, y)); + EXPECT_FP_EXCEPTION(FE_INVALID); + }; + + test_snan(sNaN, large); + test_snan(sNaN, neg_small); + test_snan(sNaN, zero); + test_snan(sNaN, inf); + test_snan(sNaN, aNaN); + test_snan(sNaN, sNaN); + + test_snan(neg_sNaN, large); + test_snan(neg_sNaN, neg_small); + test_snan(neg_sNaN, zero); + test_snan(neg_sNaN, inf); + test_snan(neg_sNaN, aNaN); + test_snan(neg_sNaN, neg_sNaN); + } + + void test_less_than_or_equals() { + EXPECT_TRUE(less_than_or_equals(neg_small, small)); + EXPECT_TRUE(less_than_or_equals(small, large)); + EXPECT_TRUE(less_than_or_equals(neg_inf, small)); + + EXPECT_TRUE(less_than_or_equals(small, small)); + EXPECT_TRUE(less_than_or_equals(zero, neg_zero)); + EXPECT_TRUE(less_than_or_equals(inf, inf)); + + EXPECT_FALSE(less_than_or_equals(small, neg_small)); + EXPECT_FALSE(less_than_or_equals(large, small)); + EXPECT_FALSE(less_than_or_equals(inf, small)); + + EXPECT_TRUE(less_than_or_equals(neg_large, small)); + EXPECT_FALSE(less_than_or_equals(large, neg_small)); + + auto test_qnan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(less_than_or_equals(x, y)); + 
EXPECT_FP_EXCEPTION(FE_INVALID); + }; + + test_qnan(aNaN, small); + test_qnan(small, aNaN); + test_qnan(aNaN, aNaN); + test_qnan(neg_aNaN, neg_small); + test_qnan(neg_small, neg_aNaN); + test_qnan(neg_aNaN, neg_aNaN); + + auto test_snan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(less_than_or_equals(x, y)); + EXPECT_FP_EXCEPTION(FE_INVALID); + }; + + test_snan(sNaN, small); + test_snan(sNaN, neg_small); + test_snan(sNaN, zero); + test_snan(sNaN, inf); + test_snan(sNaN, aNaN); + test_snan(sNaN, sNaN); + + test_snan(neg_sNaN, small); + test_snan(neg_sNaN, neg_small); + test_snan(neg_sNaN, zero); + test_snan(neg_sNaN, inf); + test_snan(neg_sNaN, aNaN); + test_snan(neg_sNaN, neg_sNaN); + } + + void test_greater_than_or_equals() { + EXPECT_TRUE(greater_than_or_equals(small, neg_small)); + EXPECT_TRUE(greater_than_or_equals(large, small)); + EXPECT_TRUE(greater_than_or_equals(inf, small)); + + EXPECT_TRUE(greater_than_or_equals(small, small)); + EXPECT_TRUE(greater_than_or_equals(zero, neg_zero)); + EXPECT_TRUE(greater_than_or_equals(neg_inf, neg_inf)); + + EXPECT_FALSE(greater_than_or_equals(neg_small, small)); + EXPECT_FALSE(greater_than_or_equals(small, large)); + EXPECT_FALSE(greater_than_or_equals(neg_inf, small)); + + EXPECT_TRUE(greater_than_or_equals(large, neg_small)); + EXPECT_FALSE(greater_than_or_equals(neg_large, small)); + + auto test_qnan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(greater_than_or_equals(x, y)); + EXPECT_FP_EXCEPTION(FE_INVALID); + }; + + test_qnan(aNaN, small); + test_qnan(small, aNaN); + test_qnan(aNaN, aNaN); + test_qnan(neg_aNaN, neg_small); + test_qnan(neg_small, neg_aNaN); + test_qnan(neg_aNaN, neg_aNaN); + + auto test_snan = [&](T x, T y) { + LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + EXPECT_FALSE(greater_than_or_equals(x, y)); + EXPECT_FP_EXCEPTION(FE_INVALID); + }; + + test_snan(sNaN, small); + test_snan(sNaN, neg_small); + test_snan(sNaN, zero); + test_snan(sNaN, inf); + test_snan(sNaN, aNaN); + test_snan(sNaN, sNaN); + + test_snan(neg_sNaN, small); + test_snan(neg_sNaN, neg_small); + test_snan(neg_sNaN, zero); + test_snan(neg_sNaN, inf); + test_snan(neg_sNaN, aNaN); + test_snan(neg_sNaN, neg_sNaN); + } +}; + +#define TEST_COMPARISON_OPS(Name, Type) \ + using LlvmLibc##Name##ComparisonOperationsTest = \ + ComparisonOperationsTest; \ + TEST_F(LlvmLibc##Name##ComparisonOperationsTest, Equals) { test_equals(); } \ + TEST_F(LlvmLibc##Name##ComparisonOperationsTest, LessThan) { \ + test_less_than(); \ + } \ + TEST_F(LlvmLibc##Name##ComparisonOperationsTest, GreaterThan) { \ + test_greater_than(); \ + } \ + TEST_F(LlvmLibc##Name##ComparisonOperationsTest, LessThanOrEquals) { \ + test_less_than_or_equals(); \ + } \ + TEST_F(LlvmLibc##Name##ComparisonOperationsTest, GreaterThanOrEquals) { \ + test_greater_than_or_equals(); \ + } + +TEST_COMPARISON_OPS(Float, float) +TEST_COMPARISON_OPS(Double, double) +TEST_COMPARISON_OPS(LongDouble, long double) + +#ifdef LIBC_TYPES_HAS_FLOAT16 +TEST_COMPARISON_OPS(Float16, float16) +#endif // LIBC_TYPES_HAS_FLOAT16 + +#ifdef LIBC_TYPES_HAS_FLOAT128 +TEST_COMPARISON_OPS(Float128, float128) +#endif // LIBC_TYPES_HAS_FLOAT128 + +TEST_COMPARISON_OPS(BFloat16, BFloat16) diff --git a/libc/test/src/__support/File/platform_file_test.cpp b/libc/test/src/__support/File/platform_file_test.cpp index 6b2be2a149329..425da6ce2ad86 100644 --- a/libc/test/src/__support/File/platform_file_test.cpp +++ 
b/libc/test/src/__support/File/platform_file_test.cpp @@ -21,7 +21,8 @@ LIBC_INLINE File *openfile(const char *file_name, const char *mode) { } TEST(LlvmLibcPlatformFileTest, CreateWriteCloseAndReadBack) { - constexpr char FILENAME[] = "testdata/create_write_close_and_readback.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/create_write_close_and_readback.test"); File *file = openfile(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(file->write(TEXT, TEXT_SIZE).value, TEXT_SIZE); @@ -42,7 +43,8 @@ TEST(LlvmLibcPlatformFileTest, CreateWriteCloseAndReadBack) { } TEST(LlvmLibcPlatformFileTest, CreateWriteSeekAndReadBack) { - constexpr char FILENAME[] = "testdata/create_write_seek_and_readback.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/create_write_seek_and_readback.test"); File *file = openfile(FILENAME, "w+"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(file->write(TEXT, TEXT_SIZE).value, TEXT_SIZE); @@ -62,7 +64,8 @@ TEST(LlvmLibcPlatformFileTest, CreateWriteSeekAndReadBack) { } TEST(LlvmLibcPlatformFileTest, CreateAppendCloseAndReadBack) { - constexpr char FILENAME[] = "testdata/create_append_close_and_readback.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/create_append_close_and_readback.test"); File *file = openfile(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(file->write(TEXT, TEXT_SIZE).value, TEXT_SIZE); @@ -91,7 +94,8 @@ TEST(LlvmLibcPlatformFileTest, CreateAppendCloseAndReadBack) { } TEST(LlvmLibcPlatformFileTest, CreateAppendSeekAndReadBack) { - constexpr char FILENAME[] = "testdata/create_append_seek_and_readback.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/create_append_seek_and_readback.test"); File *file = openfile(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(file->write(TEXT, TEXT_SIZE).value, TEXT_SIZE); @@ -124,7 +128,7 @@ TEST(LlvmLibcPlatformFileTest, LargeFile) { for (size_t i = 0; i < DATA_SIZE; ++i) write_data[i] = BYTE; - constexpr char FILENAME[] = "testdata/large_file.test"; + constexpr char FILENAME[] = APPEND_LIBC_TEST("testdata/large_file.test"); File *file = openfile(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -151,7 +155,8 @@ TEST(LlvmLibcPlatformFileTest, LargeFile) { } TEST(LlvmLibcPlatformFileTest, ReadSeekCurAndRead) { - constexpr char FILENAME[] = "testdata/read_seek_cur_and_read.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/read_seek_cur_and_read.test"); File *file = openfile(FILENAME, "w"); ASSERT_FALSE(file == nullptr); constexpr char CONTENT[] = "1234567890987654321"; @@ -178,7 +183,8 @@ TEST(LlvmLibcPlatformFileTest, ReadSeekCurAndRead) { } TEST(LlvmLibcPlatformFileTest, IncorrectOperation) { - constexpr char FILENAME[] = "testdata/incorrect_operation.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/incorrect_operation.test"); char data[1] = {123}; File *file = openfile(FILENAME, "w"); diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp index 14d074156d033..d514df9317852 100644 --- a/libc/test/src/__support/wchar/string_converter_test.cpp +++ b/libc/test/src/__support/wchar/string_converter_test.cpp @@ -245,6 +245,63 @@ TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) { ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); } +TEST(LlvmLibcStringConverterTest, InvalidCharacterOutsideBounds) { + // if an invalid character exists in the source string but we don't have space + // to write it, we should return a "stop converting" 
error rather than an + // invalid character error + + // first 4 bytes are clown emoji (🤡) + // next 3 form an invalid character + const char *src1 = "\xF0\x9F\xA4\xA1\x90\x88\x30"; + LIBC_NAMESPACE::internal::mbstate ps1; + LIBC_NAMESPACE::internal::StringConverter sc1( + reinterpret_cast(src1), &ps1, 1); + + auto res1 = sc1.popUTF32(); + ASSERT_TRUE(res1.has_value()); + ASSERT_EQ(static_cast(res1.value()), 0x1f921); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 4); + + res1 = sc1.popUTF32(); + ASSERT_FALSE(res1.has_value()); + // no space to write error NOT invalid character error (EILSEQ) + ASSERT_EQ(static_cast(res1.error()), -1); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 4); + + const wchar_t src2[] = { + static_cast(0x1f921), static_cast(0xffffff), + static_cast(0x0)}; // clown emoji, invalid utf32 + LIBC_NAMESPACE::internal::mbstate ps2; + LIBC_NAMESPACE::internal::StringConverter sc2( + reinterpret_cast(src2), &ps2, 4); + + auto res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0xF0); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0x9F); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0xA4); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0xA1); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_FALSE(res2.has_value()); + // no space to write error NOT invalid character error (EILSEQ) + ASSERT_EQ(static_cast(res2.error()), -1); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); +} + TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) { /* We do NOT test partially popping a character and expecting the next diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp index cb88bfcade0dc..5c30fb7c8718f 100644 --- a/libc/test/src/math/cospif_test.cpp +++ b/libc/test/src/math/cospif_test.cpp @@ -100,7 +100,7 @@ TEST_F(LlvmLibcCospifTest, SmallValues) { LIBC_NAMESPACE::cospif(x), 0.5); } -// SDCOMP-26094: check sinfpi in the cases for which the range reducer +// SDCOMP-26094: check cospif in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. 
TEST_F(LlvmLibcCospifTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/math/explogxf_test.cpp b/libc/test/src/math/explogxf_test.cpp index 01197b835433f..49cc96291a392 100644 --- a/libc/test/src/math/explogxf_test.cpp +++ b/libc/test/src/math/explogxf_test.cpp @@ -43,12 +43,8 @@ TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) { def_prec); } -TEST_F(LlvmLibcExplogfTest, Log2InFloatRange) { - CHECK_DATA(0.0f, inf, mpfr::Operation::Log2, LIBC_NAMESPACE::log2_eval, - f_normal, def_count, def_prec); -} - TEST_F(LlvmLibcExplogfTest, LogInFloatRange) { - CHECK_DATA(0.0f, inf, mpfr::Operation::Log, LIBC_NAMESPACE::log_eval, - f_normal, def_count, def_prec); + CHECK_DATA(0.0f, inf, mpfr::Operation::Log, + LIBC_NAMESPACE::acoshf_internal::log_eval, f_normal, def_count, + def_prec); } diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index ad2155f329cd9..4aac1fabfbd62 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -164,7 +164,7 @@ TEST_F(LlvmLibcSinCosfTest, SpecialValues) { } } -// SDCOMP-26094: check sinf in the cases for which the range reducer +// SDCOMP-26094: check sincosf in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcSinCosfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp index 986c676761f0e..94e3dbc4f07d4 100644 --- a/libc/test/src/math/sinpif_test.cpp +++ b/libc/test/src/math/sinpif_test.cpp @@ -100,7 +100,7 @@ TEST_F(LlvmLibcSinpifTest, SmallValues) { LIBC_NAMESPACE::sinpif(x), 0.5); } -// SDCOMP-26094: check sinfpi in the cases for which the range reducer +// SDCOMP-26094: check sinpif in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. 
TEST_F(LlvmLibcSinpifTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index b53184c30be36..bab1d33edcb04 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -24,7 +24,8 @@ using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; + constexpr const char *TEST_FILE_NAME = + APPEND_LIBC_TEST("testdata/write_read_append.test"); auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); auto *fp = LIBC_NAMESPACE::fdopen(fd, "w"); @@ -54,7 +55,8 @@ TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { } TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { - constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test"; + constexpr const char *TEST_FILE_NAME = + APPEND_LIBC_TEST("testdata/invalid_fd.test"); auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC); LIBC_NAMESPACE::close(fd); @@ -65,7 +67,8 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { } TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { - constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test"; + constexpr const char *TEST_FILE_NAME = + APPEND_LIBC_TEST("testdata/invalid_mode.test"); auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU); ASSERT_ERRNO_SUCCESS(); diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index be2e50271b510..1d242a0475aba 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -56,9 +56,10 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { }; TEST_F(LlvmLibcGetcTest, WriteAndReadCharactersWithFgetc) { - test_with_func(&LIBC_NAMESPACE::fgetc, "testdata/fgetc.test"); + test_with_func(&LIBC_NAMESPACE::fgetc, + APPEND_LIBC_TEST("testdata/fgetc.test")); } TEST_F(LlvmLibcGetcTest, WriteAndReadCharactersWithGetc) { - test_with_func(&LIBC_NAMESPACE::getc, "testdata/getc.test"); + test_with_func(&LIBC_NAMESPACE::getc, APPEND_LIBC_TEST("testdata/getc.test")); } diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index bef9dafd3d87c..16d79211f5e8f 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -62,9 +62,10 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { TEST_F(LlvmLibcGetcTest, WriteAndReadCharactersWithFgetcUnlocked) { test_with_func(&LIBC_NAMESPACE::fgetc_unlocked, - "testdata/fgetc_unlocked.test"); + APPEND_LIBC_TEST("testdata/fgetc_unlocked.test")); } TEST_F(LlvmLibcGetcTest, WriteAndReadCharactersWithGetcUnlocked) { - test_with_func(&LIBC_NAMESPACE::getc_unlocked, "testdata/getc_unlocked.test"); + test_with_func(&LIBC_NAMESPACE::getc_unlocked, + APPEND_LIBC_TEST("testdata/getc_unlocked.test")); } diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index 8fc38b0659181..14f054eee1339 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -20,7 +20,7 @@ using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using namespace 
LIBC_NAMESPACE::testing::ErrnoSetterMatcher; TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { - constexpr char FILENAME[] = "testdata/fgets.test"; + constexpr char FILENAME[] = APPEND_LIBC_TEST("testdata/fgets.test"); ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); constexpr char CONTENT[] = "123456789\n" diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index e097785832d56..02328042b92b3 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -29,7 +29,8 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns; TEST_F(LlvmLibcFILETest, SimpleFileOperations) { - constexpr char FILENAME[] = "testdata/simple_operations.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/simple_operations.test"); ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_GE(LIBC_NAMESPACE::fileno(file), 0); @@ -127,7 +128,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { } TEST_F(LlvmLibcFILETest, FFlush) { - constexpr char FILENAME[] = "testdata/fflush.test"; + constexpr char FILENAME[] = APPEND_LIBC_TEST("testdata/fflush.test"); ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+"); ASSERT_FALSE(file == nullptr); constexpr char CONTENT[] = "1234567890987654321"; @@ -154,7 +155,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { }; constexpr MyStruct WRITE_DATA[] = {{'a', 1}, {'b', 2}, {'c', 3}}; constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct); - constexpr char FILENAME[] = "testdata/fread_fwrite.test"; + constexpr char FILENAME[] = APPEND_LIBC_TEST("testdata/fread_fwrite.test"); FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); diff --git a/libc/test/src/stdio/fopen_test.cpp b/libc/test/src/stdio/fopen_test.cpp index 42e7c57cffe04..3f651f755e7f3 100644 --- a/libc/test/src/stdio/fopen_test.cpp +++ b/libc/test/src/stdio/fopen_test.cpp @@ -17,7 +17,8 @@ TEST(LlvmLibcFOpenTest, PrintToFile) { int result; - FILE *file = LIBC_NAMESPACE::fopen("./testdata/test_data.txt", "w"); + FILE *file = + LIBC_NAMESPACE::fopen(APPEND_LIBC_TEST("testdata/test.txt"), "w"); ASSERT_FALSE(file == nullptr); static constexpr char STRING[] = "A simple string written to a file\n"; @@ -26,7 +27,8 @@ TEST(LlvmLibcFOpenTest, PrintToFile) { ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); - FILE *new_file = LIBC_NAMESPACE::fopen("./testdata/test_data.txt", "r"); + FILE *new_file = + LIBC_NAMESPACE::fopen(APPEND_LIBC_TEST("testdata/test.txt"), "r"); ASSERT_FALSE(new_file == nullptr); static char data[64] = {0}; diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 82a3e039d9baa..6799323cc6ad9 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -32,7 +32,7 @@ using ::fread; } // namespace printf_test TEST(LlvmLibcFPrintfTest, WriteToFile) { - const char *FILENAME = "fprintf_output.test"; + const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); ::FILE *file = printf_test::fopen(FILE_PATH, "w"); diff --git a/libc/test/src/stdio/fscanf_test.cpp b/libc/test/src/stdio/fscanf_test.cpp index e5b8c4f422bac..451ff94055ea5 100644 --- a/libc/test/src/stdio/fscanf_test.cpp +++ b/libc/test/src/stdio/fscanf_test.cpp @@ -34,7 +34,7 @@ using ::fwrite; } // namespace scanf_test TEST(LlvmLibcFScanfTest, WriteToFile) { - const char 
*FILENAME = "fscanf_output.test"; + const char *FILENAME = APPEND_LIBC_TEST("fscanf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); ::FILE *file = scanf_test::fopen(FILE_PATH, "w"); ASSERT_FALSE(file == nullptr); diff --git a/libc/test/src/stdio/ftell_test.cpp b/libc/test/src/stdio/ftell_test.cpp index 01ff071f2ee78..1f762f38585bc 100644 --- a/libc/test/src/stdio/ftell_test.cpp +++ b/libc/test/src/stdio/ftell_test.cpp @@ -21,7 +21,7 @@ class LlvmLibcFTellTest : public LIBC_NAMESPACE::testing::Test { protected: void test_with_bufmode(int bufmode) { - constexpr char FILENAME[] = "testdata/ftell.test"; + constexpr char FILENAME[] = APPEND_LIBC_TEST("testdata/ftell.test"); // We will set a special buffer to the file so that we guarantee buffering. constexpr size_t BUFFER_SIZE = 1024; char buffer[BUFFER_SIZE]; diff --git a/libc/test/src/stdio/putc_test.cpp b/libc/test/src/stdio/putc_test.cpp index e881a0e2d0108..6bf482794f0b8 100644 --- a/libc/test/src/stdio/putc_test.cpp +++ b/libc/test/src/stdio/putc_test.cpp @@ -16,7 +16,7 @@ #include "test/UnitTest/Test.h" TEST(LlvmLibcPutcTest, WriteToFile) { - constexpr char FILENAME[] = "testdata/putc_output.test"; + constexpr char FILENAME[] = APPEND_LIBC_TEST("testdata/putc_output.test"); ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp index 296bff1f5dc15..20d166f0acb2f 100644 --- a/libc/test/src/stdio/remove_test.cpp +++ b/libc/test/src/stdio/remove_test.cpp @@ -25,7 +25,7 @@ TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *FILENAME = "remove.test.file"; + constexpr const char *FILENAME = APPEND_LIBC_TEST("remove.test.file"); auto TEST_FILE = libc_make_test_file_path(FILENAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); ASSERT_ERRNO_SUCCESS(); @@ -42,7 +42,7 @@ TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) { // it was removed. 
using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *FILENAME = "remove.test.dir"; + constexpr const char *FILENAME = APPEND_LIBC_TEST("remove.test.dir"); auto TEST_DIR = libc_make_test_file_path(FILENAME); ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU), Succeeds(0)); diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp index 135fb98c07fbb..af957e0fcbf79 100644 --- a/libc/test/src/stdio/rename_test.cpp +++ b/libc/test/src/stdio/rename_test.cpp @@ -24,7 +24,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *FILENAME0 = "rename.test.file0"; + constexpr const char *FILENAME0 = APPEND_LIBC_TEST("rename.test.file0"); auto TEST_FILEPATH0 = libc_make_test_file_path(FILENAME0); int fd = LIBC_NAMESPACE::open(TEST_FILEPATH0, O_WRONLY | O_CREAT, S_IRWXU); @@ -33,7 +33,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Succeeds(0)); - constexpr const char *FILENAME1 = "rename.test.file1"; + constexpr const char *FILENAME1 = APPEND_LIBC_TEST("rename.test.file1"); auto TEST_FILEPATH1 = libc_make_test_file_path(FILENAME1); ASSERT_THAT(LIBC_NAMESPACE::rename(TEST_FILEPATH0, TEST_FILEPATH1), Succeeds(0)); @@ -44,7 +44,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { TEST_F(LlvmLibcRenameTest, RenameNonExistent) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; - constexpr const char *FILENAME1 = "rename.test.file1"; + constexpr const char *FILENAME1 = APPEND_LIBC_TEST("rename.test.file1"); auto TEST_FILEPATH1 = libc_make_test_file_path(FILENAME1); ASSERT_THAT(LIBC_NAMESPACE::rename("non-existent", TEST_FILEPATH1), diff --git a/libc/test/src/stdio/setbuf_test.cpp b/libc/test/src/stdio/setbuf_test.cpp index 25fea59076626..f1f98f7040402 100644 --- a/libc/test/src/stdio/setbuf_test.cpp +++ b/libc/test/src/stdio/setbuf_test.cpp @@ -18,7 +18,8 @@ TEST(LlvmLibcSetbufTest, DefaultBufsize) { // The idea in this test is to change the buffer after opening a file and // ensure that read and write work as expected. - constexpr char FILENAME[] = "testdata/setbuf_test_default_bufsize.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/setbuf_test_default_bufsize.test"); ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); char buffer[BUFSIZ]; @@ -41,7 +42,8 @@ TEST(LlvmLibcSetbufTest, DefaultBufsize) { TEST(LlvmLibcSetbufTest, NullBuffer) { // The idea in this test is that we set a null buffer and ensure that // everything works correctly. - constexpr char FILENAME[] = "testdata/setbuf_test_null_buffer.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/setbuf_test_null_buffer.test"); ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); LIBC_NAMESPACE::setbuf(file, nullptr); diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index a0936ba79ef73..f55b8e2c3c722 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -23,7 +23,7 @@ TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { // then set a NBF buffer to the write handle. Since it is NBF, the data // written using the write handle should be immediately readable by the read // handle. 
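// Illustrative aside (not part of the patch): the filename changes throughout
// these test hunks use the APPEND_LIBC_TEST macro added to
// libc/test/UnitTest/Test.h earlier in this diff. It appends the stringized
// value of LIBC_TEST to the path, presumably so different test configurations
// get distinct data files. A minimal reproduction of the expansion, with
// LIBC_TEST=unit assumed purely for illustration (the real value is supplied
// by the build, not shown here):
#define LIBC_TEST unit
#define STR(X) #X
#define EVAL_THEN_STR(X) STR(X)
#define APPEND_LIBC_TEST(X) X "." EVAL_THEN_STR(LIBC_TEST)

// "testdata/setvbuf_nbf.test" "." "unit" pastes into one literal:
static_assert(sizeof(APPEND_LIBC_TEST("testdata/setvbuf_nbf.test")) ==
                  sizeof("testdata/setvbuf_nbf.test.unit"),
              "expands to the original path with \".unit\" appended");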
- constexpr char FILENAME[] = "testdata/setvbuf_nbf.test"; + constexpr char FILENAME[] = APPEND_LIBC_TEST("testdata/setvbuf_nbf.test"); ::FILE *fw = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(fw == nullptr); @@ -59,7 +59,7 @@ TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) { // then set a LBF buffer to the write handle. Since it is LBF, the data // written using the write handle should be available right after a '\n' is // written. - constexpr char FILENAME[] = "testdata/setvbuf_lbf.test"; + constexpr char FILENAME[] = APPEND_LIBC_TEST("testdata/setvbuf_lbf.test"); ::FILE *fw = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(fw == nullptr); @@ -96,7 +96,8 @@ TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) { } TEST(LlvmLibcSetbufTest, InvalidBufferMode) { - constexpr char FILENAME[] = "testdata/setvbuf_invalid_bufmode.test"; + constexpr char FILENAME[] = + APPEND_LIBC_TEST("testdata/setvbuf_invalid_bufmode.test"); ::FILE *f = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(f == nullptr); char buf[BUFSIZ]; diff --git a/libc/test/src/stdio/ungetc_test.cpp b/libc/test/src/stdio/ungetc_test.cpp index b9d7530fc7177..917dbc2a77ab4 100644 --- a/libc/test/src/stdio/ungetc_test.cpp +++ b/libc/test/src/stdio/ungetc_test.cpp @@ -17,7 +17,7 @@ #include "test/UnitTest/Test.h" TEST(LlvmLibcUngetcTest, UngetAndReadBack) { - constexpr char FILENAME[] = "testdata/ungetc_test.test"; + constexpr char FILENAME[] = APPEND_LIBC_TEST("testdata/ungetc_test.test"); ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); constexpr char CONTENT[] = "abcdef"; diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp index e99b382d12112..7af7eca46a1ec 100644 --- a/libc/test/src/stdio/unlocked_fileop_test.cpp +++ b/libc/test/src/stdio/unlocked_fileop_test.cpp @@ -21,7 +21,8 @@ using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { - constexpr char fNAME[] = "testdata/unlocked_read_and_write.test"; + constexpr char fNAME[] = + APPEND_LIBC_TEST("testdata/unlocked_read_and_write.test"); ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w"); ASSERT_FALSE(f == nullptr); constexpr char CONTENT[] = "1234567890987654321"; diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index 80d484500d5f2..f50565a0f68ca 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -45,7 +45,7 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, } TEST(LlvmLibcVFPrintfTest, WriteToFile) { - const char *FILENAME = "vfprintf_output.test"; + const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); ::FILE *file = printf_test::fopen(FILE_PATH, "w"); diff --git a/libc/test/src/stdio/vfscanf_test.cpp b/libc/test/src/stdio/vfscanf_test.cpp index b66538671f620..38a64611d0705 100644 --- a/libc/test/src/stdio/vfscanf_test.cpp +++ b/libc/test/src/stdio/vfscanf_test.cpp @@ -42,7 +42,7 @@ static int call_vfscanf(::FILE *stream, const char *__restrict format, ...) 
{ } TEST(LlvmLibcVFScanfTest, WriteToFile) { - const char *FILENAME = "vfscanf_output.test"; + const char *FILENAME = APPEND_LIBC_TEST("vfscanf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); ::FILE *file = scanf_test::fopen(FILE_PATH, "w"); ASSERT_FALSE(file == nullptr); diff --git a/libc/test/src/sys/epoll/linux/epoll_create_test.cpp b/libc/test/src/sys/epoll/linux/epoll_create_test.cpp index 06c17c6cf29e6..2bbfe4fbe81ff 100644 --- a/libc/test/src/sys/epoll/linux/epoll_create_test.cpp +++ b/libc/test/src/sys/epoll/linux/epoll_create_test.cpp @@ -10,7 +10,6 @@ #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include // For syscall numbers. using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; using LlvmLibcEpollCreateTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; @@ -23,8 +22,6 @@ TEST_F(LlvmLibcEpollCreateTest, Basic) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds()); } -#ifdef SYS_epoll_create TEST_F(LlvmLibcEpollCreateTest, Fails) { ASSERT_THAT(LIBC_NAMESPACE::epoll_create(0), Fails(EINVAL)); } -#endif diff --git a/libc/test/src/wchar/mbrtowc_test.cpp b/libc/test/src/wchar/mbrtowc_test.cpp index 5a14d8e25935c..c406300b9ca34 100644 --- a/libc/test/src/wchar/mbrtowc_test.cpp +++ b/libc/test/src/wchar/mbrtowc_test.cpp @@ -190,6 +190,18 @@ TEST_F(LlvmLibcMBRToWCTest, NullString) { ASSERT_ERRNO_SUCCESS(); } +TEST_F(LlvmLibcMBRToWCTest, NullDest) { + const char ch[4] = {static_cast(0xF0), static_cast(0x9F), + static_cast(0xA4), + static_cast(0xA1)}; // 🤡 clown emoji + mbstate_t *mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + // reading nullptr should return correct size + size_t n = LIBC_NAMESPACE::mbrtowc(nullptr, ch, 10, mb); + ASSERT_EQ(static_cast(n), 4); + ASSERT_ERRNO_SUCCESS(); +} + TEST_F(LlvmLibcMBRToWCTest, InvalidMBState) { const char ch[4] = {static_cast(0xC2), static_cast(0x8E), static_cast(0xC7), static_cast(0x8C)}; diff --git a/libc/test/src/wctype/CMakeLists.txt b/libc/test/src/wctype/CMakeLists.txt new file mode 100644 index 0000000000000..5459cdb4a9b71 --- /dev/null +++ b/libc/test/src/wctype/CMakeLists.txt @@ -0,0 +1,11 @@ +add_custom_target(libc_wctype_unittests) + +add_libc_test( + iswalpha_test + SUITE + libc_wctype_unittests + SRCS + iswalpha_test.cpp + DEPENDS + libc.src.wctype.iswalpha +) diff --git a/libc/test/src/wctype/iswalpha_test.cpp b/libc/test/src/wctype/iswalpha_test.cpp new file mode 100644 index 0000000000000..f3f75f4dc7aa5 --- /dev/null +++ b/libc/test/src/wctype/iswalpha_test.cpp @@ -0,0 +1,54 @@ +//===-- Unittests for iswalpha --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/span.h" +#include "src/wctype/iswalpha.h" + +#include "test/UnitTest/LibcTest.h" +#include "test/UnitTest/Test.h" + +namespace { + +// TODO: Merge the wctype tests using this framework. 
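// Illustrative sketch (not part of the patch): the iswalpha entrypoint above
// forwards to internal::iswalpha from src/__support/wctype_utils.h, whose body
// is not shown in this diff. For the default-locale behavior exercised by the
// test below, a plain ASCII classification like the following would suffice;
// this is only an assumed stand-in, not the actual wctype_utils implementation.
inline bool ascii_iswalpha_sketch(int ch) {
  const int lower = ch | 0x20; // fold 'A'..'Z' onto 'a'..'z'
  return lower >= 'a' && lower <= 'z';
}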
+constexpr char WALPHA_ARRAY[] = { + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', + 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', +}; + +bool in_span(int ch, LIBC_NAMESPACE::cpp::span arr) { + for (size_t i = 0; i < arr.size(); ++i) + if (static_cast(arr[i]) == ch) + return true; + return false; +} + +} // namespace + +TEST(LlvmLibciswalpha, SimpleTest) { + EXPECT_TRUE(LIBC_NAMESPACE::iswalpha('a')); + EXPECT_TRUE(LIBC_NAMESPACE::iswalpha('B')); + + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha('3')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha(' ')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha('?')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha('\0')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha(-1)); +} + +TEST(LlvmLibciswalpha, DefaultLocale) { + // Loops through all characters, verifying that letters return + // true and everything else returns false. + for (int ch = -255; ch < 255; ++ch) { + if (in_span(ch, WALPHA_ARRAY)) + EXPECT_TRUE(LIBC_NAMESPACE::iswalpha(ch)); + else + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha(ch)); + } +} diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 2570d1a106d21..e4e9a74639b17 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -405,6 +405,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) -I${CMAKE_CURRENT_SOURCE_DIR}/clc/include # Error on undefined macros -Werror=undef + -fdiscard-value-names ) if( NOT "${cpu}" STREQUAL "" ) diff --git a/libclc/clc/include/clc/atomic/atomic_decl.inc b/libclc/clc/include/clc/atomic/atomic_decl.inc new file mode 100644 index 0000000000000..b790a94c7d288 --- /dev/null +++ b/libclc/clc/include/clc/atomic/atomic_decl.inc @@ -0,0 +1,47 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// MemoryOrder is memory order supported by Clang __scoped_atomic* builtins. +// MemoryScope is memory scope supported by Clang __scoped_atomic* builtins. 
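For context on those two parameters: recent Clang exposes scoped atomic builtins that take a memory-order argument and an additional memory-scope argument. A host-compilable sketch, assuming a Clang new enough to provide __scoped_atomic_fetch_add and the __MEMORY_SCOPE_* macros (none of these constants are introduced by this diff):

// Sketch: the MemoryOrder/MemoryScope ints are forwarded straight into Clang's
// __scoped_atomic* builtins, using the __ATOMIC_* and __MEMORY_SCOPE_* values.
static int fetch_add_device_scope(int *p, int v) {
  return __scoped_atomic_fetch_add(p, v, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);
}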
+ +#ifdef __CLC_SCALAR +#if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) + +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ + int MemoryScope); +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void FUNCTION(volatile ADDRSPACE __CLC_GENTYPE *Ptr, \ + __CLC_GENTYPE Value, int MemoryOrder, \ + int MemoryScope); +#elif defined(__CLC_COMPARE_EXCHANGE) +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \ + __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \ + int MemoryScope); +#else +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ + int MemoryOrder, int MemoryScope); +#endif + +__CLC_DECLARE_ATOMIC(global) +__CLC_DECLARE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DECLARE_ATOMIC() +#endif + +#undef __CLC_DECLARE_ATOMIC + +#endif // defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) +#endif // __CLC_SCALAR diff --git a/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h b/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h new file mode 100644 index 0000000000000..ae7918ac32e43 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ + +#include + +#define FUNCTION __clc_atomic_compare_exchange +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_COMPARE_EXCHANGE +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_dec.h b/libclc/clc/include/clc/atomic/clc_atomic_dec.h new file mode 100644 index 0000000000000..ada36ba3ff9b3 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_dec.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
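Concretely, once a header such as clc_atomic_load.h sets FUNCTION and __CLC_NO_VALUE_ARG and the gentype machinery instantiates this file for a 32-bit scalar, the __CLC_NO_VALUE_ARG branch above expands (global address space, __CLC_GENTYPE = int) to roughly:

_CLC_OVERLOAD _CLC_DECL int __clc_atomic_load(volatile global int *Ptr,
                                              int MemoryOrder, int MemoryScope);

and the default branch yields the value-taking form, e.g. for __clc_atomic_fetch_add:

_CLC_OVERLOAD _CLC_DECL int __clc_atomic_fetch_add(volatile global int *Ptr, int Value,
                                                   int MemoryOrder, int MemoryScope);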
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ + +#include + +#define FUNCTION __clc_atomic_dec +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_exchange.h b/libclc/clc/include/clc/atomic/clc_atomic_exchange.h new file mode 100644 index 0000000000000..7e626d4a8830b --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_exchange.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ + +#include + +#define FUNCTION __clc_atomic_exchange + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h new file mode 100644 index 0000000000000..ad0c2eb4607a7 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h new file mode 100644 index 0000000000000..80810c38cbbb8 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_and + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h new file mode 100644 index 0000000000000..56f511922e5c7 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h new file mode 100644 index 0000000000000..f17408d28a35d --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h new file mode 100644 index 0000000000000..b82069e6f960e --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_or + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h new file mode 100644 index 0000000000000..6cfd224629d60 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h new file mode 100644 index 0000000000000..b007b47a9369d --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_xor + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_inc.h b/libclc/clc/include/clc/atomic/clc_atomic_inc.h new file mode 100644 index 0000000000000..3ddef4a8bf355 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_inc.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_INC_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_INC_H__ + +#include + +#define FUNCTION __clc_atomic_inc +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_INC_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_load.h b/libclc/clc/include/clc/atomic/clc_atomic_load.h new file mode 100644 index 0000000000000..a4899b34b88a1 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_load.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ + +#include + +#define FUNCTION __clc_atomic_load +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_store.h b/libclc/clc/include/clc/atomic/clc_atomic_store.h new file mode 100644 index 0000000000000..6baf0eb7ea32b --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_store.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ + +#include + +#define FUNCTION __clc_atomic_store +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_RETURN_VOID +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ diff --git a/libclc/clc/include/clc/integer/clc_bit_reverse.h b/libclc/clc/include/clc/integer/clc_bit_reverse.h new file mode 100644 index 0000000000000..c945e326c74fa --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bit_reverse.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_INTEGER_CLC_BIT_REVERSE_H__ +#define __CLC_INTEGER_CLC_BIT_REVERSE_H__ + +#define FUNCTION __clc_bit_reverse +#define __CLC_BODY + +#include + +#undef FUNCTION + +#endif // __CLC_INTEGER_CLC_BIT_REVERSE_H__ diff --git a/libc/include/uchar.h.def b/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc similarity index 60% rename from libc/include/uchar.h.def rename to libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc index 31b7fcb73ded6..c93eff08de0bc 100644 --- a/libc/include/uchar.h.def +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc @@ -1,4 +1,4 @@ -//===-- C standard library header uchar.h ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,11 +6,5 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_UCHAR_H -#define LLVM_LIBC_UCHAR_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_UCHAR_H +_CLC_OVERLOAD _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE base, uint offset, + uint count); diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h b/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h new file mode 100644 index 0000000000000..9c2e047b8be00 --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ +#define __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ + +#include + +#define FUNCTION __clc_bitfield_extract_signed +#define __RETTYPE __CLC_S_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h b/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h new file mode 100644 index 0000000000000..95305a3027e5d --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ +#define __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ + +#include + +#define FUNCTION __clc_bitfield_extract_unsigned +#define __RETTYPE __CLC_U_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_insert.h b/libclc/clc/include/clc/integer/clc_bitfield_insert.h new file mode 100644 index 0000000000000..f4d36b2ad2d2e --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_insert.h @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_INTEGER_CLC_BITFIELD_INSERT_H__ +#define __CLC_INTEGER_CLC_BITFIELD_INSERT_H__ + +#include + +#define FUNCTION __clc_bitfield_insert +#define __CLC_BODY +#include + +#endif // __CLC_INTEGER_CLC_BITFIELD_INSERT_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_insert.inc b/libclc/clc/include/clc/integer/clc_bitfield_insert.inc new file mode 100644 index 0000000000000..22f58bdc09830 --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_insert.inc @@ -0,0 +1,11 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE base, + __CLC_GENTYPE insert, + uint offset, uint count); diff --git a/libclc/clc/include/clc/relational/binary_decl.inc b/libclc/clc/include/clc/relational/binary_decl.inc index bcdf5238b8f58..dc8ec9db7b7da 100644 --- a/libclc/clc/include/clc/relational/binary_decl.inc +++ b/libclc/clc/include/clc/relational/binary_decl.inc @@ -6,4 +6,12 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DECL __CLC_INTN FUNCTION(__CLC_FLOATN a, __CLC_FLOATN b); +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b); + +#undef __RETTYPE diff --git a/libclc/clc/include/clc/relational/clc_isfinite.h b/libclc/clc/include/clc/relational/clc_isfinite.h index 5e71ec7a0640a..444d733039819 100644 --- a/libclc/clc/include/clc/relational/clc_isfinite.h +++ b/libclc/clc/include/clc/relational/clc_isfinite.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isfinite #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isgreater.h b/libclc/clc/include/clc/relational/clc_isgreater.h index e2e6911a80cdd..88de46854961d 100644 --- a/libclc/clc/include/clc/relational/clc_isgreater.h +++ b/libclc/clc/include/clc/relational/clc_isgreater.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isgreaterequal.h b/libclc/clc/include/clc/relational/clc_isgreaterequal.h index 3fe8835aff9d5..42308036f102f 100644 --- a/libclc/clc/include/clc/relational/clc_isgreaterequal.h +++ b/libclc/clc/include/clc/relational/clc_isgreaterequal.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isgreaterequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isless.h b/libclc/clc/include/clc/relational/clc_isless.h index 01384cf6fa4a0..6fdc6c54947c0 100644 --- a/libclc/clc/include/clc/relational/clc_isless.h +++ b/libclc/clc/include/clc/relational/clc_isless.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isless #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_islessequal.h b/libclc/clc/include/clc/relational/clc_islessequal.h index a4b77a451b248..e592287b23099 100644 --- a/libclc/clc/include/clc/relational/clc_islessequal.h +++ 
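The new __RETTYPE selection in binary_decl.inc (and unary_decl.inc below) follows the usual OpenCL convention for relational builtins: scalar forms return an int holding 0 or 1, while vector forms return a same-width signed integer vector holding 0 or -1 per lane, which is what __CLC_BIT_INTN is presumed to name here. A small OpenCL C illustration, not taken from this diff:

// Scalar relational: plain 0/1 as int.
int  r1 = isgreater(1.0f, 2.0f);                                  // 0
// Vector relational: per-lane all-ones (-1) for true, 0 for false.
int2 r2 = isgreater((float2)(3.0f, 0.0f), (float2)(1.0f, 1.0f));  // (-1, 0)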
b/libclc/clc/include/clc/relational/clc_islessequal.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_islessequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_islessgreater.h b/libclc/clc/include/clc/relational/clc_islessgreater.h index 9fb6d641bfa14..a2f10707a677d 100644 --- a/libclc/clc/include/clc/relational/clc_islessgreater.h +++ b/libclc/clc/include/clc/relational/clc_islessgreater.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_islessgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isnormal.h b/libclc/clc/include/clc/relational/clc_isnormal.h index d580fed5a7395..2281bc4245d03 100644 --- a/libclc/clc/include/clc/relational/clc_isnormal.h +++ b/libclc/clc/include/clc/relational/clc_isnormal.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isnormal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isnotequal.h b/libclc/clc/include/clc/relational/clc_isnotequal.h index 16982fc3c5aaa..c2640fc0899a6 100644 --- a/libclc/clc/include/clc/relational/clc_isnotequal.h +++ b/libclc/clc/include/clc/relational/clc_isnotequal.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isnotequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isordered.h b/libclc/clc/include/clc/relational/clc_isordered.h index 7ba26662105fc..cb9be31311575 100644 --- a/libclc/clc/include/clc/relational/clc_isordered.h +++ b/libclc/clc/include/clc/relational/clc_isordered.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isunordered.h b/libclc/clc/include/clc/relational/clc_isunordered.h index eac158d245191..36d314ff0e1be 100644 --- a/libclc/clc/include/clc/relational/clc_isunordered.h +++ b/libclc/clc/include/clc/relational/clc_isunordered.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isunordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_signbit.h b/libclc/clc/include/clc/relational/clc_signbit.h index 892263a09e99c..9e423ab448953 100644 --- a/libclc/clc/include/clc/relational/clc_signbit.h +++ b/libclc/clc/include/clc/relational/clc_signbit.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_signbit #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/floatn.inc b/libclc/clc/include/clc/relational/floatn.inc deleted file mode 100644 index 263937f6eef6f..0000000000000 --- a/libclc/clc/include/clc/relational/floatn.inc +++ /dev/null @@ -1,132 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include - -#define __CLC_FLOATN float -#define __CLC_INTN int -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float2 -#define __CLC_INTN int2 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float3 -#define __CLC_INTN int3 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float4 -#define __CLC_INTN int4 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float8 -#define __CLC_INTN int8 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float16 -#define __CLC_INTN int16 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#undef __CLC_FLOAT -#undef __CLC_INT - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -#define __CLC_FLOATN double -#define __CLC_INTN int -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double2 -#define __CLC_INTN long2 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double3 -#define __CLC_INTN long3 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double4 -#define __CLC_INTN long4 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double8 -#define __CLC_INTN long8 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double16 -#define __CLC_INTN long16 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#endif -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define __CLC_FLOATN half -#define __CLC_INTN int -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half2 -#define __CLC_INTN short2 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half3 -#define __CLC_INTN short3 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half4 -#define __CLC_INTN short4 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half8 -#define __CLC_INTN short8 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half16 -#define __CLC_INTN short16 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#endif - -#undef __CLC_BODY diff --git a/libclc/clc/include/clc/relational/unary_decl.inc b/libclc/clc/include/clc/relational/unary_decl.inc index b9fb36c905469..cc3f2d065529b 100644 --- a/libclc/clc/include/clc/relational/unary_decl.inc +++ b/libclc/clc/include/clc/relational/unary_decl.inc @@ -6,4 +6,12 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DECL __CLC_INTN FUNCTION(__CLC_FLOATN x); +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE x); + +#undef __RETTYPE diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index bf8736a726315..ee4f771799e8e 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1,4 +1,17 @@ async/clc_prefetch.cl +atomic/clc_atomic_compare_exchange.cl +atomic/clc_atomic_dec.cl +atomic/clc_atomic_exchange.cl +atomic/clc_atomic_fetch_add.cl +atomic/clc_atomic_fetch_and.cl +atomic/clc_atomic_fetch_max.cl 
+atomic/clc_atomic_fetch_min.cl +atomic/clc_atomic_fetch_or.cl +atomic/clc_atomic_fetch_sub.cl +atomic/clc_atomic_fetch_xor.cl +atomic/clc_atomic_inc.cl +atomic/clc_atomic_load.cl +atomic/clc_atomic_store.cl common/clc_degrees.cl common/clc_radians.cl common/clc_sign.cl @@ -15,6 +28,10 @@ geometric/clc_normalize.cl integer/clc_abs.cl integer/clc_abs_diff.cl integer/clc_add_sat.cl +integer/clc_bitfield_extract_signed.cl +integer/clc_bitfield_extract_unsigned.cl +integer/clc_bitfield_insert.cl +integer/clc_bit_reverse.cl integer/clc_clz.cl integer/clc_ctz.cl integer/clc_hadd.cl diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl new file mode 100644 index 0000000000000..796dedcef3857 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc new file mode 100644 index 0000000000000..32ff9b45b769e --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if defined(__SPIR32__) || defined(CLC_NVPTX) +#if (defined(__CLC_FPSIZE) && __CLC_FPSIZE <= 32) || \ + (defined(__CLC_GENSIZE) && (__CLC_GENSIZE == 32)) +#define __CLC_HAS_ATOMIC +#endif +#else // defined(__SPIR32__) || defined(CLC_NVPTX) +#if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) +#define __CLC_HAS_ATOMIC +#endif +#endif // defined(__SPIR32__) || defined(CLC_NVPTX) + +#ifdef __CLC_HAS_ATOMIC + +#ifdef __CLC_FPSIZE + +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_atomic_compare_exchange( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \ + __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \ + int MemoryScope) { \ + __CLC_U_GENTYPE Comp = __CLC_AS_U_GENTYPE(Comparator); \ + __scoped_atomic_compare_exchange_n( \ + (ADDRSPACE __CLC_U_GENTYPE *)Ptr, &Comp, __CLC_AS_U_GENTYPE(Value), \ + false, MemoryOrderEqual, MemoryOrderUnequal, MemoryScope); \ + return __CLC_AS_GENTYPE(Comp); \ + } + +#else + +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_atomic_compare_exchange( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \ + __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \ + int MemoryScope) { \ + __scoped_atomic_compare_exchange_n(Ptr, &Comparator, Value, false, \ + MemoryOrderEqual, MemoryOrderUnequal, \ + MemoryScope); \ + return Comparator; \ + } + +#endif // __CLC_FPSIZE + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DEFINE_ATOMIC() +#endif + +#undef __CLC_DEFINE_ATOMIC + +#endif // __CLC_HAS_ATOMIC +#undef __CLC_HAS_ATOMIC + +#endif // __CLC_SCALAR diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl b/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl new file mode 100644 index 0000000000000..f35a9624fd013 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_dec +#define __IMPL_FUNCTION __scoped_atomic_fetch_add +#define __CLC_INC_DEC + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc new file mode 100644 index 0000000000000..2c45f49f60848 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
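The __CLC_FPSIZE branch above performs the compare-exchange on the value's bit pattern (via __CLC_AS_U_GENTYPE) rather than on the floating-point value, since the underlying builtin compares representations. A host-side C++ sketch of the same idea, assuming a 32-bit float and the GCC/Clang __atomic builtins (an illustration, not the libclc code):

#include <cstdint>
#include <cstring>

static float cmpxchg_float_bits(uint32_t *ptr, float expected, float desired) {
  uint32_t exp_bits, des_bits;
  std::memcpy(&exp_bits, &expected, sizeof(float));  // bit-cast; no FP comparison
  std::memcpy(&des_bits, &desired, sizeof(float));
  __atomic_compare_exchange_n(ptr, &exp_bits, des_bits, /*weak=*/false,
                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  float observed;
  std::memcpy(&observed, &exp_bits, sizeof(float));  // exp_bits now holds the prior value
  return observed;                                   // caller compares it against expected
}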
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if defined(__SPIR32__) || defined(CLC_NVPTX) +#if (defined(__CLC_FPSIZE) && __CLC_FPSIZE <= 32) || \ + (defined(__CLC_GENSIZE) && (__CLC_GENSIZE == 32)) +#define __CLC_HAS_ATOMIC +#endif +#else // defined(__SPIR32__) || defined(CLC_NVPTX) +#if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) +#define __CLC_HAS_ATOMIC +#endif +#endif // defined(__SPIR32__) || defined(CLC_NVPTX) + +#ifdef __CLC_HAS_ATOMIC + +#ifndef __CLC_PTR_CASTTYPE +#define __CLC_PTR_CASTTYPE __CLC_GENTYPE +#endif + +#ifndef __CLC_AS_RETTYPE +#define __CLC_AS_RETTYPE(x) x +#endif + +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ + int MemoryScope) { \ + return __CLC_AS_RETTYPE(__IMPL_FUNCTION( \ + (ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, MemoryOrder, MemoryScope)); \ + } +#elif defined(__CLC_INC_DEC) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ + int MemoryScope) { \ + return __CLC_AS_RETTYPE( \ + __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, (__CLC_GENTYPE)1, \ + MemoryOrder, MemoryScope)); \ + } +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void FUNCTION(volatile ADDRSPACE __CLC_GENTYPE *Ptr, \ + __CLC_GENTYPE Value, int MemoryOrder, \ + int MemoryScope) { \ + __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, MemoryOrder, \ + MemoryScope); \ + } +#else +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ + int MemoryOrder, int MemoryScope) { \ + return __CLC_AS_RETTYPE( \ + __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, \ + MemoryOrder, MemoryScope)); \ + } +#endif + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DEFINE_ATOMIC() +#endif + +#undef __CLC_DEFINE_ATOMIC + +#endif // __CLC_HAS_ATOMIC +#undef __CLC_HAS_ATOMIC + +#endif // __CLC_SCALAR diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl new file mode 100644 index 0000000000000..52fd11afed6a2 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
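With the defaults above (__CLC_PTR_CASTTYPE falling back to __CLC_GENTYPE and __CLC_AS_RETTYPE being the identity), the last branch expands, for __clc_atomic_fetch_add on a global 32-bit int, to roughly:

_CLC_OVERLOAD _CLC_DECL int __clc_atomic_fetch_add(volatile global int *Ptr, int Value,
                                                   int MemoryOrder, int MemoryScope) {
  return __scoped_atomic_fetch_add((global int *)Ptr, Value, MemoryOrder, MemoryScope);
}

The pointer cast is what lets the floating-point instantiations reuse the same skeleton: they redefine __CLC_PTR_CASTTYPE to the same-width integer type and __CLC_AS_RETTYPE to bit-cast the result back, as the .cl files below do.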
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_exchange +#define __IMPL_FUNCTION __scoped_atomic_exchange_n + +#define __CLC_BODY +#include + +#undef __CLC_PTR_CASTTYPE +#undef __CLC_AS_RETTYPE +#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x) + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl new file mode 100644 index 0000000000000..0dc44919627b3 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_add +#define __IMPL_FUNCTION __scoped_atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libc/include/dlfcn.h.def b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl similarity index 52% rename from libc/include/dlfcn.h.def rename to libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl index 31395871c6b97..ec89738bc0f62 100644 --- a/libc/include/dlfcn.h.def +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl @@ -1,4 +1,4 @@ -//===-- C standard library header dlfcn.h ---------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,12 +6,10 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_DLFCN_H -#define LLVM_LIBC_DLFCN_H - -#include "__llvm-libc-common.h" -#include "llvm-libc-macros/dlfcn-macros.h" +#include -%%public_api() +#define FUNCTION __clc_atomic_fetch_and +#define __IMPL_FUNCTION __scoped_atomic_fetch_and -#endif // LLVM_LIBC_DLFCN_H +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl new file mode 100644 index 0000000000000..0acac711aa96d --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_max +#define __IMPL_FUNCTION __scoped_atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl new file mode 100644 index 0000000000000..7a098588ec005 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_min +#define __IMPL_FUNCTION __scoped_atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libc/include/search.h.def b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl similarity index 53% rename from libc/include/search.h.def rename to libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl index 6301ba7b656ce..e0f48fa408350 100644 --- a/libc/include/search.h.def +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl @@ -1,4 +1,4 @@ -//===-- POSIX header search.h ---------------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,13 +6,10 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SEARCH_H -#define LLVM_LIBC_SEARCH_H - -#include "__llvm-libc-common.h" -#define __need_size_t -#include +#include -%%public_api() +#define FUNCTION __clc_atomic_fetch_or +#define __IMPL_FUNCTION __scoped_atomic_fetch_or -#endif // LLVM_LIBC_SEARCH_H +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl new file mode 100644 index 0000000000000..a4c2c1da1555c --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_sub +#define __IMPL_FUNCTION __scoped_atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libc/include/setjmp.h.def b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl similarity index 52% rename from libc/include/setjmp.h.def rename to libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl index 670bc1ac0fe24..4424a298178fd 100644 --- a/libc/include/setjmp.h.def +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl @@ -1,4 +1,4 @@ -//===-- C standard library header setjmp.h --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SETJMP_H -#define LLVM_LIBC_SETJMP_H - -#include "__llvm-libc-common.h" +#include -%%public_api() +#define FUNCTION __clc_atomic_fetch_xor +#define __IMPL_FUNCTION __scoped_atomic_fetch_xor -#endif // LLVM_LIBC_SETJMP_H +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl b/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl new file mode 100644 index 0000000000000..019aa8d9d6dd8 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_inc +#define __IMPL_FUNCTION __scoped_atomic_fetch_sub +#define __CLC_INC_DEC + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl new file mode 100644 index 0000000000000..1f083073e43ff --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_load +#define __IMPL_FUNCTION __scoped_atomic_load_n +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#undef __CLC_PTR_CASTTYPE +#undef __CLC_AS_RETTYPE +#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x) + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl new file mode 100644 index 0000000000000..8fd165b9a83b8 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_store +#define __IMPL_FUNCTION __scoped_atomic_store_n +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#undef __CLC_PTR_CASTTYPE +#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/integer/clc_bit_reverse.cl b/libclc/clc/lib/generic/integer/clc_bit_reverse.cl new file mode 100644 index 0000000000000..439957383f583 --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bit_reverse.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_bit_reverse +#define __IMPL_FUNCTION(x) __builtin_elementwise_bitreverse +#define __CLC_BODY + +#include diff --git a/libc/include/dirent.h.def b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl similarity index 59% rename from libc/include/dirent.h.def rename to libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl index 6786578fbd067..d779ed6a43593 100644 --- a/libc/include/dirent.h.def +++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl @@ -1,4 +1,4 @@ -//===-- POSIX header dirent.h ---------------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,7 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_DIRENT_H -#define LLVM_LIBC_DIRENT_H - -#include "__llvm-libc-common.h" - -%%public_api() +#include -#endif // LLVM_LIBC_DIRENT_H +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc new file mode 100644 index 0000000000000..84cae2166f7ce --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_S_GENTYPE +__clc_bitfield_extract_signed(__CLC_GENTYPE base, uint offset, uint count) { + if (count == 0) + return 0; + __CLC_U_GENTYPE x = __CLC_AS_U_GENTYPE(base) + << (__CLC_GENSIZE - offset - count); + // Implement an arithmetic shift right. 
+ __CLC_U_GENTYPE s = -(x >> (__CLC_GENSIZE - 1)); + __CLC_U_GENTYPE result = ((s ^ x) >> (__CLC_GENSIZE - count)) ^ s; + return __CLC_AS_S_GENTYPE(result); +} diff --git a/libc/include/string.h.def b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl similarity index 59% rename from libc/include/string.h.def rename to libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl index 339d005e43a4f..bf7db401034dc 100644 --- a/libc/include/string.h.def +++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl @@ -1,4 +1,4 @@ -//===-- C standard library header string.h --------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,7 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_STRING_H -#define LLVM_LIBC_STRING_H - -#include "__llvm-libc-common.h" - -%%public_api() +#include -#endif // LLVM_LIBC_STRING_H +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc new file mode 100644 index 0000000000000..bc81ce5c98b09 --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE +__clc_bitfield_extract_unsigned(__CLC_GENTYPE base, uint offset, uint count) { + if (count == 0) + return 0; + __CLC_U_GENTYPE result = __CLC_AS_U_GENTYPE(base) + << (__CLC_GENSIZE - offset - count); + return result >> (__CLC_GENSIZE - count); +} diff --git a/libc/include/spawn.h.def b/libclc/clc/lib/generic/integer/clc_bitfield_insert.cl similarity index 61% rename from libc/include/spawn.h.def rename to libclc/clc/lib/generic/integer/clc_bitfield_insert.cl index a8d7015852868..a40fc804f2187 100644 --- a/libc/include/spawn.h.def +++ b/libclc/clc/lib/generic/integer/clc_bitfield_insert.cl @@ -1,4 +1,4 @@ -//===-- POSIX header spawn.h ----------------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,7 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SPAWN_H -#define LLVM_LIBC_SPAWN_H - -#include "__llvm-libc-common.h" - -%%public_api() +#include -#endif // LLVM_LIBC_SPAWN_H +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_insert.inc b/libclc/clc/lib/generic/integer/clc_bitfield_insert.inc new file mode 100644 index 0000000000000..ad8dac28750cc --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bitfield_insert.inc @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
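To see why that shift/xor sequence sign-extends the extracted field, here is the same computation traced for a 32-bit gentype, extracting the 4-bit field at offset 4 from base = 0xF0 (the concrete values are chosen for the walk-through, not taken from the diff):

// C/C++ trace of __clc_bitfield_extract_signed for GENSIZE = 32.
uint32_t base = 0xF0, offset = 4, count = 4;
uint32_t x = base << (32 - offset - count);  // 0xF0000000: field moved to the top bits
uint32_t s = -(x >> 31);                     // 0xFFFFFFFF: all-ones iff the field's sign bit is set
uint32_t r = ((s ^ x) >> (32 - count)) ^ s;  // 0xFFFFFFFF, i.e. (int)-1: 0xF sign-extended
// The xor-shift-xor pattern is an arithmetic shift right built from logical shifts.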
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_bitfield_insert(__CLC_GENTYPE base, + __CLC_GENTYPE insert, + uint offset, + uint count) { + __CLC_U_GENTYPE u_base = __CLC_AS_U_GENTYPE(base); + __CLC_U_GENTYPE u_insert = __CLC_AS_U_GENTYPE(insert); + __CLC_U_GENTYPE mask = (((__CLC_U_GENTYPE)1 << count) - (__CLC_U_GENTYPE)1) + << offset; + mask = count < __CLC_GENSIZE ? mask : ~(__CLC_U_GENTYPE)0; + __CLC_U_GENTYPE result = ((u_insert << offset) & mask) | (u_base & ~mask); + return __CLC_AS_GENTYPE(result); +} diff --git a/libclc/clc/lib/generic/math/clc_native_divide.inc b/libclc/clc/lib/generic/math/clc_native_divide.inc index fdf1794812c5a..dac176fb986bd 100644 --- a/libclc/clc/lib/generic/math/clc_native_divide.inc +++ b/libclc/clc/lib/generic/math/clc_native_divide.inc @@ -8,5 +8,6 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_divide(__CLC_GENTYPE x, __CLC_GENTYPE y) { + _Pragma("clang fp reciprocal(on)"); return x / y; } diff --git a/libclc/clc/lib/generic/math/clc_native_recip.inc b/libclc/clc/lib/generic/math/clc_native_recip.inc index 57eb35a9522f8..e7246dc08a77c 100644 --- a/libclc/clc/lib/generic/math/clc_native_recip.inc +++ b/libclc/clc/lib/generic/math/clc_native_recip.inc @@ -7,5 +7,6 @@ //===----------------------------------------------------------------------===// _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_recip(__CLC_GENTYPE val) { + _Pragma("clang fp reciprocal(on)"); return 1.0f / val; } diff --git a/libclc/clc/lib/generic/math/clc_native_rsqrt.inc b/libclc/clc/lib/generic/math/clc_native_rsqrt.inc index 7a3b0b2af2721..2b2c4bdada9f9 100644 --- a/libclc/clc/lib/generic/math/clc_native_rsqrt.inc +++ b/libclc/clc/lib/generic/math/clc_native_rsqrt.inc @@ -7,5 +7,6 @@ //===----------------------------------------------------------------------===// _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_rsqrt(__CLC_GENTYPE val) { + _Pragma("clang fp reciprocal(on)"); return 1.0f / __clc_native_sqrt(val); } diff --git a/libclc/clc/lib/generic/math/clc_native_tan.inc b/libclc/clc/lib/generic/math/clc_native_tan.inc index f61a78968a754..f0c6c6d37d2b7 100644 --- a/libclc/clc/lib/generic/math/clc_native_tan.inc +++ b/libclc/clc/lib/generic/math/clc_native_tan.inc @@ -7,5 +7,6 @@ //===----------------------------------------------------------------------===// _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_tan(__CLC_GENTYPE val) { + _Pragma("clang fp reciprocal(on)"); return __clc_native_sin(val) / __clc_native_cos(val); } diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 056706ee629cd..dc4b1e8286ec0 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -425,17 +425,21 @@ function(add_libclc_builtin_set) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) endif() - if(CMAKE_HOST_UNIX OR LLVM_USE_SYMLINKS) - set(LIBCLC_LINK_OR_COPY create_symlink) - else() - set(LIBCLC_LINK_OR_COPY copy) - endif() - foreach( a IN LISTS ARG_ALIASES ) + if(CMAKE_HOST_UNIX OR LLVM_USE_SYMLINKS) + cmake_path(RELATIVE_PATH libclc_builtins_lib + BASE_DIRECTORY ${LIBCLC_OUTPUT_LIBRARY_DIR} + OUTPUT_VARIABLE LIBCLC_LINK_OR_COPY_SOURCE) + set(LIBCLC_LINK_OR_COPY create_symlink) + else() + set(LIBCLC_LINK_OR_COPY_SOURCE ${libclc_builtins_lib}) + set(LIBCLC_LINK_OR_COPY copy) + endif() + set( alias_suffix 
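The mask logic in __clc_bitfield_insert above is easiest to read with concrete numbers; for a 32-bit gentype, inserting a 4-bit field at offset 8 (example values chosen purely for illustration):

// C/C++ trace of the mask construction and merge.
uint32_t base = 0xFFFFFFFF, insert = 0x0, offset = 8, count = 4;
uint32_t mask = ((1u << count) - 1u) << offset;                  // 0x00000F00
// count == GENSIZE would shift out of range, hence the fallback to ~0 in the code.
uint32_t result = ((insert << offset) & mask) | (base & ~mask);  // 0xFFFFF0FF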
"${a}-${ARG_TRIPLE}.bc" ) add_custom_command( OUTPUT ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix} - COMMAND ${CMAKE_COMMAND} -E ${LIBCLC_LINK_OR_COPY} ${libclc_builtins_lib} ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix} + COMMAND ${CMAKE_COMMAND} -E ${LIBCLC_LINK_OR_COPY} ${LIBCLC_LINK_OR_COPY_SOURCE} ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix} DEPENDS prepare-${obj_suffix} ) add_custom_target( alias-${alias_suffix} ALL diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_add.h b/libclc/opencl/include/clc/opencl/atomic/atomic_add.h index 821ae7aab05bf..50fb99d1362fc 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_add.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_add.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_add -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_ADD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_and.h b/libclc/opencl/include/clc/opencl/atomic/atomic_and.h index d10cfed9b581a..8ce328c9739aa 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_and.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_and.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_and -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_AND_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h new file mode 100644 index 0000000000000..76eeda7ba3469 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__ + +#define FUNCTION atomic_compare_exchange_strong +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_COMPARE_EXCHANGE +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h new file mode 100644 index 0000000000000..12788ad03a2d1 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__ + +#define FUNCTION atomic_compare_exchange_weak +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_COMPARE_EXCHANGE +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc b/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc index e060e3aaea161..1b2bf17bd6dfd 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc @@ -6,17 +6,55 @@ // //===----------------------------------------------------------------------===// -#define __CLC_DECLARE_ATOMIC(ADDRSPACE, TYPE) \ - _CLC_OVERLOAD _CLC_DECL TYPE FUNCTION(volatile ADDRSPACE TYPE *, TYPE); +#ifdef __CLC_SCALAR -#define __CLC_DECLARE_ATOMIC_ADDRSPACE(TYPE) \ - __CLC_DECLARE_ATOMIC(global, TYPE) \ - __CLC_DECLARE_ATOMIC(local, TYPE) +#if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) && \ + defined(cl_khr_int64_extended_atomics)) +#define HAVE_64_ATOMIC +#endif +#if defined(__CLC_FPSIZE) && (__CLC_FPSIZE < 64 || defined(HAVE_64_ATOMIC)) +#define HAVE_FP_ATOMIC +#endif +#if defined(__CLC_GENSIZE) && \ + ((__CLC_GENSIZE == 32) || \ + (__CLC_GENSIZE == 64 && defined(HAVE_64_ATOMIC))) +#define HAVE_INT_ATOMIC +#endif +#if defined(HAVE_FP_ATOMIC) || defined(HAVE_INT_ATOMIC) -__CLC_DECLARE_ATOMIC_ADDRSPACE(int) -__CLC_DECLARE_ATOMIC_ADDRSPACE(uint) +#define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE) -#undef __CLC_DECLARE_ATOMIC_ADDRSPACE -#undef __CLC_DECLARE_ATOMIC +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr); +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value); +#elif defined(__CLC_COMPARE_EXCHANGE) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, \ + ADDRSPACE __CLC_GENTYPE *Expected, __CLC_GENTYPE Desired); +#else +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value); +#endif -#undef FUNCTION +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DEFINE_ATOMIC() +#endif + +#undef __CLC_DEFINE_ATOMIC + +#endif // HAVE_FP_ATOMIC || HAVE_INT_ATOMIC + +#undef HAVE_INT_ATOMIC +#undef HAVE_FP_ATOMIC +#undef HAVE_64_ATOMIC + +#endif // __CLC_SCALAR diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc b/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc new file mode 100644 index 0000000000000..e060e3aaea161 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define __CLC_DECLARE_ATOMIC(ADDRSPACE, TYPE) \ + _CLC_OVERLOAD _CLC_DECL TYPE FUNCTION(volatile ADDRSPACE TYPE *, TYPE); + +#define __CLC_DECLARE_ATOMIC_ADDRSPACE(TYPE) \ + __CLC_DECLARE_ATOMIC(global, TYPE) \ + __CLC_DECLARE_ATOMIC(local, TYPE) + +__CLC_DECLARE_ATOMIC_ADDRSPACE(int) +__CLC_DECLARE_ATOMIC_ADDRSPACE(uint) + +#undef __CLC_DECLARE_ATOMIC_ADDRSPACE +#undef __CLC_DECLARE_ATOMIC + +#undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h b/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h new file mode 100644 index 0000000000000..3949bc13401f2 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__ + +#define FUNCTION atomic_exchange + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h new file mode 100644 index 0000000000000..972c1fa69fe7b --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ + +#define FUNCTION atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h new file mode 100644 index 0000000000000..fdac049a74d3f --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ + +#define FUNCTION atomic_fetch_and + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h new file mode 100644 index 0000000000000..513b60fec2727 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ + +#define FUNCTION atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h new file mode 100644 index 0000000000000..c961c4a641656 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ + +#define FUNCTION atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h new file mode 100644 index 0000000000000..25923e3647e36 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ + +#define FUNCTION atomic_fetch_or + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h new file mode 100644 index 0000000000000..b307c30a298b3 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ + +#define FUNCTION atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h new file mode 100644 index 0000000000000..52510d018574d --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ + +#define FUNCTION atomic_fetch_xor + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_load.h b/libclc/opencl/include/clc/opencl/atomic/atomic_load.h new file mode 100644 index 0000000000000..3998a4de9452b --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_load.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ + +#define FUNCTION atomic_load +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_max.h b/libclc/opencl/include/clc/opencl/atomic/atomic_max.h index 667fa36f16f9d..6b95ad7e68d94 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_max.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_max.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_max -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_MAX_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_min.h b/libclc/opencl/include/clc/opencl/atomic/atomic_min.h index 91bb636eec875..c1dfacb40b746 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_min.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_min.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_min -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_MIN_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_or.h b/libclc/opencl/include/clc/opencl/atomic/atomic_or.h index 5c03fd157a2bc..30c32fe4889d5 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_or.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_or.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_or -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_OR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_store.h b/libclc/opencl/include/clc/opencl/atomic/atomic_store.h new file mode 100644 index 0000000000000..4893a5b88df03 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_store.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ + +#define FUNCTION atomic_store +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_RETURN_VOID +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h b/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h index 25ffe9ff4a9b7..1e7ac5505b071 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_sub -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_SUB_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h b/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h index 6b4206dedb820..043d7825483e4 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h @@ -15,6 +15,6 @@ _CLC_OVERLOAD _CLC_DECL float FUNCTION(volatile local float *, float); _CLC_OVERLOAD _CLC_DECL float FUNCTION(volatile global float *, float); -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_XCHG_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h b/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h index e94560cb6b9ed..a9bee007b9344 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_xor -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_XOR_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bit_reverse.h b/libclc/opencl/include/clc/opencl/integer/bit_reverse.h new file mode 100644 index 0000000000000..46b589557631d --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bit_reverse.h @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BIT_REVERSE_H__ +#define __CLC_OPENCL_INTEGER_BIT_REVERSE_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bit_reverse +#define __CLC_BODY + +#include + +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BIT_REVERSE_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h new file mode 100644 index 0000000000000..0a902b2a21d6d --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_SIGNED_H__ +#define __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_SIGNED_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bitfield_extract_signed +#define __RETTYPE __CLC_S_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_SIGNED_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h new file mode 100644 index 0000000000000..28064c08b113e --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_UNSIGNED_H__ +#define __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_UNSIGNED_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bitfield_extract_unsigned +#define __RETTYPE __CLC_U_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_UNSIGNED_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h b/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h new file mode 100644 index 0000000000000..e77d7a4f0b957 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BITFIELD_INSERT_H__ +#define __CLC_OPENCL_INTEGER_BITFIELD_INSERT_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BITFIELD_INSERT_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isfinite.h b/libclc/opencl/include/clc/opencl/relational/isfinite.h index 2548e6acf5109..ac3db6764073a 100644 --- a/libclc/opencl/include/clc/opencl/relational/isfinite.h +++ b/libclc/opencl/include/clc/opencl/relational/isfinite.h @@ -14,7 +14,7 @@ #define FUNCTION isfinite #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isgreater.h b/libclc/opencl/include/clc/opencl/relational/isgreater.h index 6dfe6eb810e2a..2230055115bcd 100644 --- a/libclc/opencl/include/clc/opencl/relational/isgreater.h +++ b/libclc/opencl/include/clc/opencl/relational/isgreater.h @@ -14,7 +14,7 @@ #define FUNCTION isgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h b/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h index 1db2c5d58d062..f99a620dabd78 100644 --- a/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h +++ b/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h @@ -14,7 +14,7 @@ #define FUNCTION isgreaterequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isless.h b/libclc/opencl/include/clc/opencl/relational/isless.h index 3e2afb32cddf4..74280e543e0b5 100644 --- a/libclc/opencl/include/clc/opencl/relational/isless.h +++ b/libclc/opencl/include/clc/opencl/relational/isless.h @@ -14,7 +14,7 @@ #define FUNCTION isless #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/islessequal.h b/libclc/opencl/include/clc/opencl/relational/islessequal.h index 978e6a9052c16..dcc26c37b73c1 100644 --- a/libclc/opencl/include/clc/opencl/relational/islessequal.h +++ b/libclc/opencl/include/clc/opencl/relational/islessequal.h @@ -14,7 +14,7 @@ #define FUNCTION islessequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/islessgreater.h b/libclc/opencl/include/clc/opencl/relational/islessgreater.h index 56cce7db20770..15a1eb5577531 100644 --- a/libclc/opencl/include/clc/opencl/relational/islessgreater.h +++ b/libclc/opencl/include/clc/opencl/relational/islessgreater.h @@ -14,7 +14,7 @@ #define FUNCTION islessgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isnormal.h b/libclc/opencl/include/clc/opencl/relational/isnormal.h index ee74a990b5eaf..bbb06aad0df2a 100644 --- a/libclc/opencl/include/clc/opencl/relational/isnormal.h +++ b/libclc/opencl/include/clc/opencl/relational/isnormal.h @@ -14,7 +14,7 @@ #define FUNCTION isnormal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isnotequal.h b/libclc/opencl/include/clc/opencl/relational/isnotequal.h index 7cf94e3ceec5f..c13aca8ef4be8 100644 --- a/libclc/opencl/include/clc/opencl/relational/isnotequal.h +++ b/libclc/opencl/include/clc/opencl/relational/isnotequal.h @@ -14,7 +14,7 @@ #define FUNCTION 
isnotequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isordered.h b/libclc/opencl/include/clc/opencl/relational/isordered.h index ad9770bd627f2..ea4ba3fa6fe8d 100644 --- a/libclc/opencl/include/clc/opencl/relational/isordered.h +++ b/libclc/opencl/include/clc/opencl/relational/isordered.h @@ -14,7 +14,7 @@ #define FUNCTION isordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isunordered.h b/libclc/opencl/include/clc/opencl/relational/isunordered.h index 01d2f53837317..76bf85604d1c7 100644 --- a/libclc/opencl/include/clc/opencl/relational/isunordered.h +++ b/libclc/opencl/include/clc/opencl/relational/isunordered.h @@ -14,7 +14,7 @@ #define FUNCTION isunordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/signbit.h b/libclc/opencl/include/clc/opencl/relational/signbit.h index 29591c0c126a9..6ad6595c7e294 100644 --- a/libclc/opencl/include/clc/opencl/relational/signbit.h +++ b/libclc/opencl/include/clc/opencl/relational/signbit.h @@ -14,7 +14,7 @@ #define FUNCTION signbit #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/lib/generic/SOURCES b/libclc/opencl/lib/generic/SOURCES index 46ce6d6e36c24..61757efbcaad7 100644 --- a/libclc/opencl/lib/generic/SOURCES +++ b/libclc/opencl/lib/generic/SOURCES @@ -8,24 +8,36 @@ atomic/atom_add.cl atomic/atom_and.cl atomic/atom_cmpxchg.cl atomic/atom_dec.cl -atomic/atom_inc.cl -atomic/atom_max.cl -atomic/atom_min.cl -atomic/atom_or.cl -atomic/atom_sub.cl -atomic/atom_xchg.cl -atomic/atom_xor.cl atomic/atomic_add.cl atomic/atomic_and.cl atomic/atomic_cmpxchg.cl +atomic/atomic_compare_exchange_strong.cl +atomic/atomic_compare_exchange_weak.cl atomic/atomic_dec.cl +atomic/atomic_exchange.cl +atomic/atomic_fetch_add.cl +atomic/atomic_fetch_and.cl +atomic/atomic_fetch_max.cl +atomic/atomic_fetch_min.cl +atomic/atomic_fetch_or.cl +atomic/atomic_fetch_sub.cl +atomic/atomic_fetch_xor.cl atomic/atomic_inc.cl +atomic/atomic_load.cl atomic/atomic_max.cl atomic/atomic_min.cl atomic/atomic_or.cl +atomic/atomic_store.cl atomic/atomic_sub.cl atomic/atomic_xchg.cl atomic/atomic_xor.cl +atomic/atom_inc.cl +atomic/atom_max.cl +atomic/atom_min.cl +atomic/atom_or.cl +atomic/atom_sub.cl +atomic/atom_xchg.cl +atomic/atom_xor.cl common/degrees.cl common/mix.cl common/radians.cl @@ -43,6 +55,10 @@ geometric/normalize.cl integer/abs.cl integer/abs_diff.cl integer/add_sat.cl +integer/bitfield_extract_signed.cl +integer/bitfield_extract_unsigned.cl +integer/bitfield_insert.cl +integer/bit_reverse.cl integer/clz.cl integer/ctz.cl integer/hadd.cl diff --git a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl new file mode 100644 index 0000000000000..422c03f292071 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_compare_exchange_strong +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl new file mode 100644 index 0000000000000..8a6b3c4f0110e --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_compare_exchange_weak +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_dec.cl b/libclc/opencl/lib/generic/atomic/atomic_dec.cl index 6f18cdf13428a..6de55bc0b9845 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_dec.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_dec.cl @@ -6,15 +6,11 @@ // //===----------------------------------------------------------------------===// +#include #include -#define IMPL(TYPE, AS) \ - _CLC_OVERLOAD _CLC_DEF TYPE atomic_dec(volatile AS TYPE *p) { \ - return __sync_fetch_and_sub(p, (TYPE)1); \ - } +#define FUNCTION atomic_dec +#define __IMPL_FUNCTION __clc_atomic_dec -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +#define __CLC_BODY +#include diff --git a/libclc/opencl/lib/generic/atomic/atomic_def.inc b/libclc/opencl/lib/generic/atomic/atomic_def.inc new file mode 100644 index 0000000000000..ce192bf844938 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_def.inc @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) && \ +                                 defined(cl_khr_int64_extended_atomics)) +#define HAVE_64_ATOMIC +#endif +#if defined(__CLC_FPSIZE) && (__CLC_FPSIZE < 64 || defined(HAVE_64_ATOMIC)) +#define HAVE_FP_ATOMIC +#endif +#if defined(__CLC_GENSIZE) && \ +    ((__CLC_GENSIZE == 32) || \ +     (__CLC_GENSIZE == 64 && defined(HAVE_64_ATOMIC))) +#define HAVE_INT_ATOMIC +#endif +#if defined(HAVE_FP_ATOMIC) || defined(HAVE_INT_ATOMIC) + +#define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE) + +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ +  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ +      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr) { \ +    return __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, \ +                           __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ +  } +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ +  _CLC_OVERLOAD _CLC_DEF void FUNCTION( \ +      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value) { \ +    __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ +                    __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ +  } +#elif defined(__CLC_COMPARE_EXCHANGE) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ +  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ +      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, \ +      ADDRSPACE __CLC_GENTYPE *Expected, __CLC_GENTYPE Desired) { \ +    __CLC_GENTYPE Comparator = *Expected; \ +    __CLC_GENTYPE RetValue = __clc_atomic_compare_exchange( \ +        (volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Comparator, Desired, \ +        __ATOMIC_SEQ_CST, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); \ +    if (Comparator != RetValue) { \ +      *Expected = RetValue; \ +      return true; \ +    } \ +    return false; \ +  } +#else +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ +  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ +      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value) { \ +    return __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ +                           __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ +  } +#endif + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DEFINE_ATOMIC() +#endif + +#undef __CLC_DEFINE_ATOMIC + +#endif // HAVE_FP_ATOMIC || HAVE_INT_ATOMIC + +#undef HAVE_INT_ATOMIC +#undef HAVE_FP_ATOMIC +#undef HAVE_64_ATOMIC + +#endif // __CLC_SCALAR diff --git a/libclc/opencl/lib/generic/atomic/atomic_exchange.cl b/libclc/opencl/lib/generic/atomic/atomic_exchange.cl new file mode 100644 index 0000000000000..6dae6c0a77599 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_exchange.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_exchange +#define __IMPL_FUNCTION __clc_atomic_exchange + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl new file mode 100644 index 0000000000000..bbaa1c2b0dacf --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_add +#define __IMPL_FUNCTION __clc_atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl new file mode 100644 index 0000000000000..73925844c9357 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_and +#define __IMPL_FUNCTION __clc_atomic_fetch_and + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl new file mode 100644 index 0000000000000..8c8ce11cc575f --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_max +#define __IMPL_FUNCTION __clc_atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl new file mode 100644 index 0000000000000..550459cee32d6 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_min +#define __IMPL_FUNCTION __clc_atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl new file mode 100644 index 0000000000000..2606ff3c99673 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_or +#define __IMPL_FUNCTION __clc_atomic_fetch_or + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl new file mode 100644 index 0000000000000..33772233bebed --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_sub +#define __IMPL_FUNCTION __clc_atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl new file mode 100644 index 0000000000000..6f6503e588b6f --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_xor +#define __IMPL_FUNCTION __clc_atomic_fetch_xor + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_inc.cl b/libclc/opencl/lib/generic/atomic/atomic_inc.cl index 13349e5432e5c..a160b2e2370fc 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_inc.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_inc.cl @@ -6,15 +6,11 @@ // //===----------------------------------------------------------------------===// +#include #include -#define IMPL(TYPE, AS) \ - _CLC_OVERLOAD _CLC_DEF TYPE atomic_inc(volatile AS TYPE *p) { \ - return __sync_fetch_and_add(p, (TYPE)1); \ - } +#define FUNCTION atomic_inc +#define __IMPL_FUNCTION __clc_atomic_inc -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +#define __CLC_BODY +#include diff --git a/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc new file mode 100644 index 0000000000000..0bcf300dd284a --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if __CLC_GENSIZE == 32 + +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr) { \ + return __IMPL_FUNCTION(Ptr, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + } + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) + +#undef __CLC_DEFINE_ATOMIC + +#endif // __CLC_GENSIZE == 32 + +#endif // __CLC_SCALAR diff --git a/libclc/opencl/lib/generic/atomic/atomic_load.cl b/libclc/opencl/lib/generic/atomic/atomic_load.cl new file mode 100644 index 0000000000000..459265473a8c8 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_load.cl @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_load +#define __IMPL_FUNCTION __clc_atomic_load +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_store.cl b/libclc/opencl/lib/generic/atomic/atomic_store.cl new file mode 100644 index 0000000000000..67f2c8457fc10 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_store.cl @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_store +#define __IMPL_FUNCTION __clc_atomic_store +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/integer/bit_reverse.cl b/libclc/opencl/lib/generic/integer/bit_reverse.cl new file mode 100644 index 0000000000000..23181b6b3eba5 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bit_reverse.cl @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bit_reverse +#define __CLC_BODY + +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc b/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc new file mode 100644 index 0000000000000..0262f67732afc --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __IMPL_FUNCTION +#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#endif + +_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE base, uint offset, + uint count) { + return __IMPL_FUNCTION(FUNCTION)(base, offset, count); +} diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl b/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl new file mode 100644 index 0000000000000..eaa4ac779cfd1 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bitfield_extract_signed +#define __RETTYPE __CLC_S_GENTYPE + +#define __CLC_BODY +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl b/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl new file mode 100644 index 0000000000000..fd63d5d6dee30 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bitfield_extract_unsigned +#define __RETTYPE __CLC_U_GENTYPE + +#define __CLC_BODY +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_insert.cl b/libclc/opencl/lib/generic/integer/bitfield_insert.cl new file mode 100644 index 0000000000000..6b441155f393b --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_insert.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bitfield_insert +#define __CLC_BODY +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_insert.inc b/libclc/opencl/lib/generic/integer/bitfield_insert.inc new file mode 100644 index 0000000000000..b1f45907a4361 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_insert.inc @@ -0,0 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE bitfield_insert(__CLC_GENTYPE base, + __CLC_GENTYPE insert, + uint offset, uint count) { + return __clc_bitfield_insert(base, insert, offset, count); +} diff --git a/libclc/opencl/lib/generic/relational/binary_def.inc b/libclc/opencl/lib/generic/relational/binary_def.inc index 54bb237b8f8f5..8416da0475a2c 100644 --- a/libclc/opencl/lib/generic/relational/binary_def.inc +++ b/libclc/opencl/lib/generic/relational/binary_def.inc @@ -10,6 +10,14 @@ #define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) -_CLC_OVERLOAD _CLC_DEF __CLC_INTN FUNCTION(__CLC_FLOATN a, __CLC_FLOATN b) { +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b) { return __IMPL_FUNCTION(FUNCTION)(a, b); } + +#undef __RETTYPE diff --git a/libclc/opencl/lib/generic/relational/isequal.cl b/libclc/opencl/lib/generic/relational/isequal.cl index 94f83f9452666..83002c28ceab3 100644 --- a/libclc/opencl/lib/generic/relational/isequal.cl +++ b/libclc/opencl/lib/generic/relational/isequal.cl @@ -12,4 +12,4 @@ #define FUNCTION isequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isfinite.cl b/libclc/opencl/lib/generic/relational/isfinite.cl index 695ffea806d5c..a2017133cead8 100644 --- a/libclc/opencl/lib/generic/relational/isfinite.cl +++ b/libclc/opencl/lib/generic/relational/isfinite.cl @@ -12,4 +12,4 @@ #define FUNCTION isfinite #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isgreater.cl b/libclc/opencl/lib/generic/relational/isgreater.cl index fb46ff20ac608..6eeb2b21c0493 100644 --- a/libclc/opencl/lib/generic/relational/isgreater.cl +++ b/libclc/opencl/lib/generic/relational/isgreater.cl @@ -12,4 +12,4 @@ #define FUNCTION isgreater #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isgreaterequal.cl b/libclc/opencl/lib/generic/relational/isgreaterequal.cl index b8edde2a05b77..e4e4535fd30d3 100644 --- a/libclc/opencl/lib/generic/relational/isgreaterequal.cl +++ b/libclc/opencl/lib/generic/relational/isgreaterequal.cl @@ -12,4 +12,4 @@ #define FUNCTION isgreaterequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isinf.cl b/libclc/opencl/lib/generic/relational/isinf.cl index 2c15f1f826762..2ab8c182e02a6 100644 --- a/libclc/opencl/lib/generic/relational/isinf.cl +++ 
b/libclc/opencl/lib/generic/relational/isinf.cl @@ -12,4 +12,4 @@ #define FUNCTION isinf #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isless.cl b/libclc/opencl/lib/generic/relational/isless.cl index 0af1f53e71042..4212970e7671a 100644 --- a/libclc/opencl/lib/generic/relational/isless.cl +++ b/libclc/opencl/lib/generic/relational/isless.cl @@ -12,4 +12,4 @@ #define FUNCTION isless #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/islessequal.cl b/libclc/opencl/lib/generic/relational/islessequal.cl index 9e32afc718ab2..e7aec262fc762 100644 --- a/libclc/opencl/lib/generic/relational/islessequal.cl +++ b/libclc/opencl/lib/generic/relational/islessequal.cl @@ -12,4 +12,4 @@ #define FUNCTION islessequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/islessgreater.cl b/libclc/opencl/lib/generic/relational/islessgreater.cl index c36a857dc3dfc..b775d2484550c 100644 --- a/libclc/opencl/lib/generic/relational/islessgreater.cl +++ b/libclc/opencl/lib/generic/relational/islessgreater.cl @@ -12,4 +12,4 @@ #define FUNCTION islessgreater #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isnan.cl b/libclc/opencl/lib/generic/relational/isnan.cl index 8b03930c5312f..4b7eeb5b919b6 100644 --- a/libclc/opencl/lib/generic/relational/isnan.cl +++ b/libclc/opencl/lib/generic/relational/isnan.cl @@ -12,4 +12,4 @@ #define FUNCTION isnan #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isnormal.cl b/libclc/opencl/lib/generic/relational/isnormal.cl index 4ba21cc3e17fc..60ce9dccaeaf3 100644 --- a/libclc/opencl/lib/generic/relational/isnormal.cl +++ b/libclc/opencl/lib/generic/relational/isnormal.cl @@ -12,4 +12,4 @@ #define FUNCTION isnormal #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isnotequal.cl b/libclc/opencl/lib/generic/relational/isnotequal.cl index 928923b9b2a5e..abb4d3a859663 100644 --- a/libclc/opencl/lib/generic/relational/isnotequal.cl +++ b/libclc/opencl/lib/generic/relational/isnotequal.cl @@ -12,4 +12,4 @@ #define FUNCTION isnotequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isordered.cl b/libclc/opencl/lib/generic/relational/isordered.cl index 60ca4d67ff1ea..684ee425e1203 100644 --- a/libclc/opencl/lib/generic/relational/isordered.cl +++ b/libclc/opencl/lib/generic/relational/isordered.cl @@ -12,4 +12,4 @@ #define FUNCTION isordered #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isunordered.cl b/libclc/opencl/lib/generic/relational/isunordered.cl index 3392d77856ced..84aa8cafb111a 100644 --- a/libclc/opencl/lib/generic/relational/isunordered.cl +++ b/libclc/opencl/lib/generic/relational/isunordered.cl @@ -12,4 +12,4 @@ #define FUNCTION isunordered #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/signbit.cl b/libclc/opencl/lib/generic/relational/signbit.cl index 26feb8d43fa25..d30fea7b9f6f5 100644 --- a/libclc/opencl/lib/generic/relational/signbit.cl +++ b/libclc/opencl/lib/generic/relational/signbit.cl @@ -12,4 +12,4 @@ #define FUNCTION signbit #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/unary_def.inc 
b/libclc/opencl/lib/generic/relational/unary_def.inc index 47bb33ef2da3d..f184e3cf0be56 100644 --- a/libclc/opencl/lib/generic/relational/unary_def.inc +++ b/libclc/opencl/lib/generic/relational/unary_def.inc @@ -10,6 +10,14 @@ #define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) -_CLC_OVERLOAD _CLC_DEF __CLC_INTN FUNCTION(__CLC_FLOATN a) { +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE a) { return __IMPL_FUNCTION(FUNCTION)(a); } + +#undef __RETTYPE diff --git a/libclc/utils/CMakeLists.txt b/libclc/utils/CMakeLists.txt index ea1d9e9c8ef5f..6851ae16bda07 100644 --- a/libclc/utils/CMakeLists.txt +++ b/libclc/utils/CMakeLists.txt @@ -12,8 +12,8 @@ set( LLVM_LINK_COMPONENTS if( LIBCLC_STANDALONE_BUILD ) add_llvm_executable( prepare_builtins prepare-builtins.cpp ) - set( prepare_builtins_exe prepare_builtins ) - set( prepare_builtins_target prepare_builtins ) + set( prepare_builtins_exe prepare_builtins PARENT_SCOPE ) + set( prepare_builtins_target prepare_builtins PARENT_SCOPE ) else() add_llvm_utility( prepare_builtins prepare-builtins.cpp ) setup_host_tool( prepare_builtins PREPARE_BUILTINS prepare_builtins_exe prepare_builtins_target ) diff --git a/libcxx/.clang-format b/libcxx/.clang-format index f372ac9619997..9557b955cd72c 100644 --- a/libcxx/.clang-format +++ b/libcxx/.clang-format @@ -33,6 +33,7 @@ AttributeMacros: [ '_LIBCPP_DEPRECATED_IN_CXX20', '_LIBCPP_DEPRECATED_IN_CXX23', '_LIBCPP_DEPRECATED', + '_LIBCPP_DIAGNOSE_NULLPTR_IF', '_LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION', '_LIBCPP_EXPORTED_FROM_ABI', '_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS', diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst index 9feea5fffc26c..acfcd367de505 100644 --- a/libcxx/docs/ReleaseNotes.rst +++ b/libcxx/docs/ReleaseNotes.rst @@ -1,10 +1,11 @@ -.. include:: ReleaseNotes/21.rst +.. include:: ReleaseNotes/22.rst .. Make sure to reference the non-live release notes in a toctree to avoid Sphinx errors. .. toctree:: :hidden: ReleaseNotes/20 + ReleaseNotes/21 .. The release notes are in versioned files, but we make sure to keep publishing .. them in an unversioned ReleaseNotes.html page for external sites to reference. diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index 6f18b61284f49..d31ca0130cb80 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -10,7 +10,7 @@ Written by the `Libc++ Team `_ .. warning:: - These are in-progress notes for the upcoming libc++ 20.0.0 release. + These are in-progress notes for the upcoming libc++ 21.0.0 release. Release notes for previous releases can be found on `the Download Page `_. @@ -18,7 +18,7 @@ Introduction ============ This document contains the release notes for the libc++ C++ Standard Library, -part of the LLVM Compiler Infrastructure, release 20.0.0. Here we describe the +part of the LLVM Compiler Infrastructure, release 21.0.0. Here we describe the status of libc++ in some detail, including major improvements from the previous release and new feature work. For the general LLVM release notes, see `the LLVM documentation `_. 
All LLVM releases may diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst new file mode 100644 index 0000000000000..15bf46d44b07f --- /dev/null +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -0,0 +1,59 @@ +=========================================== +Libc++ 22.0.0 (In-Progress) Release Notes +=========================================== + +.. contents:: + :local: + :depth: 2 + +Written by the `Libc++ Team `_ + +.. warning:: + + These are in-progress notes for the upcoming libc++ 22.0.0 release. + Release notes for previous releases can be found on + `the Download Page `_. + +Introduction +============ + +This document contains the release notes for the libc++ C++ Standard Library, +part of the LLVM Compiler Infrastructure, release 22.0.0. Here we describe the +status of libc++ in some detail, including major improvements from the previous +release and new feature work. For the general LLVM release notes, see `the LLVM +documentation `_. All LLVM releases may +be downloaded from the `LLVM releases web site `_. + +For more information about libc++, please see the `Libc++ Web Site +`_ or the `LLVM Web Site `_. + +Note that if you are reading this file from a Git checkout or the +main Libc++ web page, this document applies to the *next* release, not +the current one. To see the release notes for a specific release, please +see the `releases page `_. + +What's New in Libc++ 22.0.0? +============================== + +Implemented Papers +------------------ + +- P2321R2: ``zip`` (`Github `__) (The paper is partially implemented. ``zip_transform_view`` is implemented in this release) + +Improvements and New Features +----------------------------- + +Deprecations and Removals +------------------------- + +Potentially breaking changes +---------------------------- + +Announcements About Future Releases +----------------------------------- + +ABI Affecting Changes +--------------------- + +Build System Changes +-------------------- diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv index e5b2dcf8c1a5b..189f8452e0678 100644 --- a/libcxx/docs/Status/Cxx23Issues.csv +++ b/libcxx/docs/Status/Cxx23Issues.csv @@ -170,7 +170,7 @@ "`LWG3687 `__","``expected`` move constructor should move","2022-07 (Virtual)","|Complete|","16","" "`LWG3692 `__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","2022-07 (Virtual)","|Complete|","20","" "`LWG3701 `__","Make ``formatter, charT>`` requirement explicit","2022-07 (Virtual)","|Complete|","15","" -"`LWG3702 `__","Should ``zip_transform_view::iterator`` remove ``operator<``","2022-07 (Virtual)","","","" +"`LWG3702 `__","Should ``zip_transform_view::iterator`` remove ``operator<``","2022-07 (Virtual)","|Complete|","22","" "`LWG3703 `__","Missing requirements for ``expected`` requires ``is_void``","2022-07 (Virtual)","|Complete|","16","" "`LWG3704 `__","LWG 2059 added overloads that might be ill-formed for sets","2022-07 (Virtual)","","","" "`LWG3705 `__","Hashability shouldn't depend on basic_string's allocator","2022-07 (Virtual)","|Complete|","16","" @@ -222,7 +222,7 @@ "`LWG3765 `__","``const_sentinel`` should be constrained","2022-11 (Kona)","","","" "`LWG3766 `__","``view_interface::cbegin`` is underconstrained","2022-11 (Kona)","","","" "`LWG3770 `__","``const_sentinel_t`` is missing","2022-11 (Kona)","","","" -"`LWG3773 `__","``views::zip_transform`` still requires ``F`` to be ``copy_constructible`` when empty pack","2022-11 (Kona)","","","" +"`LWG3773 `__","``views::zip_transform`` still requires 
``F`` to be ``copy_constructible`` when empty pack","2022-11 (Kona)","|Complete|","22","" "`LWG3774 `__","```` should include ````","2022-11 (Kona)","","","" "`LWG3775 `__","Broken dependencies in the ``Cpp17Allocator`` requirements","2022-11 (Kona)","","","" "`LWG3778 `__","``vector`` missing exception specifications","2022-11 (Kona)","|Complete|","3.7","" @@ -234,7 +234,7 @@ "`LWG3792 `__","``__cpp_lib_constexpr_algorithms`` should also be defined in ````","2022-11 (Kona)","|Complete|","16","" "`LWG3795 `__","Self-move-assignment of ``std::future`` and ``std::shared_future`` have unimplementable postconditions","2022-11 (Kona)","","","" "`LWG3796 `__","``movable-box`` as member should use ``default-initialization`` instead of ``copy-initialization``","2022-11 (Kona)","","","" -"`LWG3798 `__","Rvalue reference and ``iterator_category``","2022-11 (Kona)","|Partial|","","``join_with_view``, ``zip_transform_view``, and ``adjacent_transform_view`` haven't been done yet since these types aren't implemented yet" +"`LWG3798 `__","Rvalue reference and ``iterator_category``","2022-11 (Kona)","|Partial|","","``adjacent_transform_view`` hasn't been done yet since this type isn't implemented yet" "`LWG3801 `__","``cartesian_product_view::iterator::distance-from`` ignores the size of last underlying range","2022-11 (Kona)","","","" "`LWG3814 `__","Add freestanding items requested by NB comments","2022-11 (Kona)","","","" "`LWG3816 `__","``flat_map`` and ``flat_multimap`` should impose sequence container requirements","2022-11 (Kona)","","","" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 4f2a8dddad92c..cd6583cb62c24 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -403,6 +403,7 @@ set(files __format/indic_conjunct_break_table.h __format/parser_std_format_spec.h __format/range_default_formatter.h + __format/range_format.h __format/range_formatter.h __format/unicode.h __format/width_estimation_table.h @@ -514,7 +515,6 @@ set(files __locale_dir/check_grouping.h __locale_dir/get_c_locale.h __locale_dir/locale_base_api.h - __locale_dir/locale_base_api/android.h __locale_dir/locale_base_api/bsd_locale_fallbacks.h __locale_dir/locale_base_api/ibm.h __locale_dir/locale_base_api/musl.h @@ -737,6 +737,7 @@ set(files __ranges/transform_view.h __ranges/view_interface.h __ranges/views.h + __ranges/zip_transform_view.h __ranges/zip_view.h __split_buffer __std_mbstate_t.h @@ -780,7 +781,6 @@ set(files __tuple/make_tuple_types.h __tuple/sfinae_helpers.h __tuple/tuple_element.h - __tuple/tuple_indices.h __tuple/tuple_like.h __tuple/tuple_like_ext.h __tuple/tuple_like_no_subrange.h diff --git a/libcxx/include/__config b/libcxx/include/__config index 1d547eac30952..19398dd276a17 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -265,13 +265,6 @@ _LIBCPP_HARDENING_MODE_DEBUG // When this option is used, the token passed to `std::random_device`'s // constructor *must* be "/dev/urandom" -- anything else is an error. // -// _LIBCPP_USING_NACL_RANDOM -// NaCl's sandbox (which PNaCl also runs in) doesn't allow filesystem access, -// including accesses to the special files under `/dev`. This implementation -// uses the NaCL syscall `nacl_secure_random_init()` to get entropy. -// When this option is used, the token passed to `std::random_device`'s -// constructor *must* be "/dev/urandom" -- anything else is an error. -// // _LIBCPP_USING_WIN32_RANDOM // Use rand_s(), for use on Windows. 
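The zip_transform entries above (P2321R2 in the 22.x release notes, LWG3702 and LWG3773 in the C++23 issue table, and the new __ranges/zip_transform_view.h header) all concern std::views::zip_transform. A minimal C++23 usage sketch with illustrative values:

#include <iostream>
#include <ranges>
#include <vector>

int main() {
  std::vector<int> a{1, 2, 3};
  std::vector<int> b{10, 20, 30};
  // zip_transform applies the callable lane-wise across the zipped ranges.
  for (int s : std::views::zip_transform([](int x, int y) { return x + y; }, a, b))
    std::cout << s << ' '; // prints: 11 22 33
}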
// When this option is used, the token passed to `std::random_device`'s @@ -283,8 +276,6 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_USING_GETENTROPY # elif defined(__Fuchsia__) # define _LIBCPP_USING_FUCHSIA_CPRNG -# elif defined(__native_client__) -# define _LIBCPP_USING_NACL_RANDOM # elif defined(_LIBCPP_WIN32API) # define _LIBCPP_USING_WIN32_RANDOM # else @@ -416,6 +407,12 @@ typedef __char32_t char32_t; # define _LIBCPP_GCC_DIAGNOSTIC_IGNORED(str) # endif +// Macros to enter and leave a state where deprecation warnings are suppressed. +# define _LIBCPP_SUPPRESS_DEPRECATED_PUSH \ + _LIBCPP_DIAGNOSTIC_PUSH _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated") \ + _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations") +# define _LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_DIAGNOSTIC_POP + # if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_FAST # define _LIBCPP_HARDENING_SIG f # elif _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_EXTENSIVE @@ -713,17 +710,6 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_WITH_CHAR8_T # endif -// Macros to enter and leave a state where deprecation warnings are suppressed. -# if defined(_LIBCPP_COMPILER_CLANG_BASED) || defined(_LIBCPP_COMPILER_GCC) -# define _LIBCPP_SUPPRESS_DEPRECATED_PUSH \ - _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wdeprecated\"") \ - _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -# define _LIBCPP_SUPPRESS_DEPRECATED_POP _Pragma("GCC diagnostic pop") -# else -# define _LIBCPP_SUPPRESS_DEPRECATED_PUSH -# define _LIBCPP_SUPPRESS_DEPRECATED_POP -# endif - # if _LIBCPP_STD_VER <= 11 # define _LIBCPP_EXPLICIT_SINCE_CXX14 # else @@ -1095,6 +1081,20 @@ typedef __char32_t char32_t; # define _LIBCPP_DIAGNOSE_WARNING(...) # endif +# if __has_attribute(__diagnose_if__) && !defined(_LIBCPP_APPLE_CLANG_VER) && \ + (!defined(_LIBCPP_CLANG_VER) || _LIBCPP_CLANG_VER >= 2001) +# define _LIBCPP_DIAGNOSE_IF(...) __attribute__((__diagnose_if__(__VA_ARGS__))) +# else +# define _LIBCPP_DIAGNOSE_IF(...) 
+# endif + +# define _LIBCPP_DIAGNOSE_NULLPTR_IF(condition, condition_description) \ + _LIBCPP_DIAGNOSE_IF( \ + condition, \ + "null passed to callee that requires a non-null argument" condition_description, \ + "warning", \ + "nonnull") + # if __has_cpp_attribute(_Clang::__lifetimebound__) # define _LIBCPP_LIFETIMEBOUND [[_Clang::__lifetimebound__]] # else diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in index fc01aaf2d8746..b68c0c8258366 100644 --- a/libcxx/include/__config_site.in +++ b/libcxx/include/__config_site.in @@ -30,7 +30,6 @@ #cmakedefine01 _LIBCPP_HAS_LOCALIZATION #cmakedefine01 _LIBCPP_HAS_UNICODE #cmakedefine01 _LIBCPP_HAS_WIDE_CHARACTERS -#cmakedefine _LIBCPP_HAS_NO_STD_MODULES #cmakedefine01 _LIBCPP_HAS_TIME_ZONE_DATABASE #cmakedefine01 _LIBCPP_INSTRUMENTED_WITH_ASAN diff --git a/libcxx/include/__flat_map/flat_multimap.h b/libcxx/include/__flat_map/flat_multimap.h index 0af6aac00c383..260d93ed25785 100644 --- a/libcxx/include/__flat_map/flat_multimap.h +++ b/libcxx/include/__flat_map/flat_multimap.h @@ -114,11 +114,12 @@ class flat_multimap { class value_compare { private: _LIBCPP_NO_UNIQUE_ADDRESS key_compare __comp_; - _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare(key_compare __c) : __comp_(__c) {} friend flat_multimap; public: - _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator()(const_reference __x, const_reference __y) const { return __comp_(__x.first, __y.first); } }; @@ -137,17 +138,17 @@ class flat_multimap { public: // [flat.map.cons], construct/copy/destroy - _LIBCPP_HIDE_FROM_ABI flat_multimap() noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap() noexcept( is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_MappedContainer> && is_nothrow_default_constructible_v<_Compare>) : __containers_(), __compare_() {} - _LIBCPP_HIDE_FROM_ABI flat_multimap(const flat_multimap&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(const flat_multimap&) = default; // The copy/move constructors are not specified in the spec, which means they should be defaulted. // However, the move constructor can potentially leave a moved-from object in an inconsistent // state if an exception is thrown. 
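For context on the _LIBCPP_DIAGNOSE_NULLPTR_IF macro introduced above: it is built on Clang's diagnose_if attribute, which raises a diagnostic whenever a predicate over the call arguments is provably true at the call site. A minimal, Clang-oriented sketch with a hypothetical function (not a libc++ declaration):

#if defined(__has_attribute)
#  if __has_attribute(diagnose_if)
#    define HAVE_DIAGNOSE_IF 1
#  endif
#endif

#if defined(HAVE_DIAGNOSE_IF)
// Warn when the argument is statically known to be null at the call site.
void copy_name(const char* src)
    __attribute__((diagnose_if(src == nullptr,
                               "null passed to callee that requires a non-null argument",
                               "warning")));
#else
void copy_name(const char* src);
#endif

void copy_name(const char*) {}

int main() { copy_name(nullptr); } // with Clang: compile-time warning, code still builds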
- _LIBCPP_HIDE_FROM_ABI flat_multimap(flat_multimap&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(flat_multimap&& __other) noexcept( is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_MappedContainer> && is_nothrow_move_constructible_v<_Compare>) # if _LIBCPP_HAS_EXCEPTIONS @@ -168,7 +169,8 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(const flat_multimap& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(const flat_multimap& __other, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __other.__containers_.keys, @@ -177,7 +179,7 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(flat_multimap&& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(flat_multimap&& __other, const _Allocator& __alloc) # if _LIBCPP_HAS_EXCEPTIONS try # endif // _LIBCPP_HAS_EXCEPTIONS @@ -194,7 +196,7 @@ class flat_multimap { # endif // _LIBCPP_HAS_EXCEPTIONS } - _LIBCPP_HIDE_FROM_ABI flat_multimap( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( key_container_type __key_cont, mapped_container_type __mapped_cont, const key_compare& __comp = key_compare()) : __containers_{.keys = std::move(__key_cont), .values = std::move(__mapped_cont)}, __compare_(__comp) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), @@ -204,7 +206,7 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( const key_container_type& __key_cont, const mapped_container_type& __mapped_cont, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), @@ -214,22 +216,22 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI - flat_multimap(const key_container_type& __key_cont, - const mapped_container_type& __mapped_cont, - const key_compare& __comp, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const key_compare& __comp, + const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont, __comp) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), "flat_multimap keys and mapped containers have different size"); __sort(); } - _LIBCPP_HIDE_FROM_ABI - flat_multimap(sorted_equivalent_t, - key_container_type __key_cont, - mapped_container_type __mapped_cont, - const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + sorted_equivalent_t, + key_container_type __key_cont, + mapped_container_type __mapped_cont, + const key_compare& __comp = key_compare()) : __containers_{.keys = std::move(__key_cont), .values = std::move(__mapped_cont)}, __compare_(__comp) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), "flat_multimap keys and mapped containers have different size"); @@ -238,11 +240,11 @@ class 
flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI - flat_multimap(sorted_equivalent_t, - const key_container_type& __key_cont, - const mapped_container_type& __mapped_cont, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + sorted_equivalent_t, + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), "flat_multimap keys and mapped containers have different size"); @@ -251,33 +253,35 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI - flat_multimap(sorted_equivalent_t, - const key_container_type& __key_cont, - const mapped_container_type& __mapped_cont, - const key_compare& __comp, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + sorted_equivalent_t, + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const key_compare& __comp, + const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont, __comp) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), "flat_multimap keys and mapped containers have different size"); _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__containers_.keys), "Key container is not sorted"); } - _LIBCPP_HIDE_FROM_ABI explicit flat_multimap(const key_compare& __comp) : __containers_(), __compare_(__comp) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multimap(const key_compare& __comp) + : __containers_(), __compare_(__comp) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI explicit flat_multimap(const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multimap(const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) {} template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) : __containers_(), __compare_(__comp) { insert(__first, __last); @@ -285,7 +289,7 @@ class flat_multimap { template requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { insert(__first, __last); @@ -293,91 +297,99 @@ class flat_multimap { template requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) - _LIBCPP_HIDE_FROM_ABI flat_multimap(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { insert(__first, __last); } template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t __fr, _Range&& __rg) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(from_range_t __fr, _Range&& __rg) : flat_multimap(__fr, std::forward<_Range>(__rg), key_compare()) {} template <_ContainerCompatibleRange _Range, class _Allocator> requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(from_range_t, _Range&& __rg, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { insert_range(std::forward<_Range>(__rg)); } template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multimap(__comp) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp) + : flat_multimap(__comp) { insert_range(std::forward<_Range>(__rg)); } template <_ContainerCompatibleRange _Range, class _Allocator> requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { insert_range(std::forward<_Range>(__rg)); } template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI flat_multimap( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) : __containers_(), __compare_(__comp) { insert(sorted_equivalent, __first, __last); } template requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) - _LIBCPP_HIDE_FROM_ABI - flat_multimap(sorted_equivalent_t, - _InputIterator __first, - _InputIterator __last, - const key_compare& __comp, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + sorted_equivalent_t, + _InputIterator __first, + _InputIterator __last, + const key_compare& __comp, + const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { insert(sorted_equivalent, __first, __last); } template requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { insert(sorted_equivalent, __first, __last); } - _LIBCPP_HIDE_FROM_ABI flat_multimap(initializer_list __il, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(initializer_list __il, const key_compare& __comp = key_compare()) : flat_multimap(__il.begin(), __il.end(), __comp) {} template requires __allocator_ctor_constraint<_Allocator> - 
_LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(__il.begin(), __il.end(), __comp, __alloc) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(initializer_list __il, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(initializer_list __il, const _Allocator& __alloc) : flat_multimap(__il.begin(), __il.end(), __alloc) {} - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(sorted_equivalent_t, initializer_list __il, const key_compare& __comp = key_compare()) : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __comp) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( sorted_equivalent_t, initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(sorted_equivalent_t, initializer_list __il, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(sorted_equivalent_t, initializer_list __il, const _Allocator& __alloc) : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __alloc) {} - _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(initializer_list __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap& operator=(initializer_list __il) { clear(); insert(__il); return *this; @@ -386,9 +398,9 @@ class flat_multimap { // copy/move assignment are not specified in the spec (defaulted) // but move assignment can potentially leave moved from object in an inconsistent // state if an exception is thrown - _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(const flat_multimap&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap& operator=(const flat_multimap&) = default; - _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(flat_multimap&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap& operator=(flat_multimap&& __other) noexcept( is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_MappedContainer> && is_nothrow_move_assignable_v<_Compare>) { auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; }); @@ -400,38 +412,54 @@ class flat_multimap { } // iterators - _LIBCPP_HIDE_FROM_ABI iterator begin() noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator begin() noexcept { return iterator(__containers_.keys.begin(), __containers_.values.begin()); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator begin() const noexcept { return const_iterator(__containers_.keys.begin(), __containers_.values.begin()); } - _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator end() noexcept { return iterator(__containers_.keys.end(), __containers_.values.end()); } - _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator end() const noexcept { return const_iterator(__containers_.keys.end(), __containers_.values.end()); } - 
_LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rbegin() noexcept { + return reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rbegin() const noexcept { + return const_reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rend() noexcept { + return reverse_iterator(begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rend() const noexcept { + return const_reverse_iterator(begin()); + } - _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); } - _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cbegin() const noexcept { return begin(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cend() const noexcept { return end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crbegin() const noexcept { + return const_reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crend() const noexcept { + return const_reverse_iterator(begin()); + } // [flat.map.capacity], capacity - [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __containers_.keys.empty(); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool empty() const noexcept { + return __containers_.keys.empty(); + } - _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __containers_.keys.size(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type size() const noexcept { + return __containers_.keys.size(); + } - _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type max_size() const noexcept { return std::min(__containers_.keys.max_size(), __containers_.values.max_size()); } @@ -439,7 +467,7 @@ class flat_multimap { template requires is_constructible_v, _Args...> && is_move_constructible_v && is_move_constructible_v - _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace(_Args&&... __args) { std::pair __pair(std::forward<_Args>(__args)...); auto __key_it = std::upper_bound(__containers_.keys.begin(), __containers_.keys.end(), __pair.first, __compare_); auto __mapped_it = __corresponding_mapped_it(*this, __key_it); @@ -450,7 +478,7 @@ class flat_multimap { template requires is_constructible_v, _Args...> - _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace_hint(const_iterator __hint, _Args&&... 
__args) { std::pair __pair(std::forward<_Args>(__args)...); auto __prev_larger = __hint != cbegin() && __compare_(__pair.first, (__hint - 1)->first); @@ -490,33 +518,35 @@ class flat_multimap { *this, __key_iter, __mapped_iter, std::move(__pair.first), std::move(__pair.second)); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const value_type& __x) { return emplace(__x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(value_type&& __x) { + return emplace(std::move(__x)); + } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, const value_type& __x) { return emplace_hint(__hint, __x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, value_type&& __x) { return emplace_hint(__hint, std::move(__x)); } template requires is_constructible_v, _PairLike> - _LIBCPP_HIDE_FROM_ABI iterator insert(_PairLike&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(_PairLike&& __x) { return emplace(std::forward<_PairLike>(__x)); } template requires is_constructible_v, _PairLike> - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, _PairLike&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, _PairLike&& __x) { return emplace_hint(__hint, std::forward<_PairLike>(__x)); } template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(_InputIterator __first, _InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -525,7 +555,8 @@ class flat_multimap { template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -534,7 +565,7 @@ class flat_multimap { } template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert_range(_Range&& __range) { if constexpr (ranges::sized_range<_Range>) { __reserve(ranges::size(__range)); } @@ -542,19 +573,23 @@ class flat_multimap { __append_sort_merge(ranges::begin(__range), ranges::end(__range)); } - _LIBCPP_HIDE_FROM_ABI void insert(initializer_list __il) { insert(__il.begin(), __il.end()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(initializer_list __il) { + insert(__il.begin(), __il.end()); + } - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + insert(sorted_equivalent_t, initializer_list __il) { insert(sorted_equivalent, __il.begin(), __il.end()); } - _LIBCPP_HIDE_FROM_ABI containers extract() && { + 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 containers extract() && { auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; }); auto __ret = std::move(__containers_); return __ret; } - _LIBCPP_HIDE_FROM_ABI void replace(key_container_type&& __key_cont, mapped_container_type&& __mapped_cont) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + replace(key_container_type&& __key_cont, mapped_container_type&& __mapped_cont) { _LIBCPP_ASSERT_VALID_INPUT_RANGE( __key_cont.size() == __mapped_cont.size(), "flat_multimap keys and mapped containers have different size"); @@ -565,15 +600,15 @@ class flat_multimap { __guard.__complete(); } - _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(iterator __position) { return __erase(__position.__key_iter_, __position.__mapped_iter_); } - _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __position) { return __erase(__position.__key_iter_, __position.__mapped_iter_); } - _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(const key_type& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); @@ -583,14 +618,14 @@ class flat_multimap { template requires(__is_compare_transparent && !is_convertible_v<_Kp &&, iterator> && !is_convertible_v<_Kp &&, const_iterator>) - _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(_Kp&& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); return __res; } - _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __first, const_iterator __last) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_it = __containers_.keys.erase(__first.__key_iter_, __last.__key_iter_); auto __mapped_it = __containers_.values.erase(__first.__mapped_iter_, __last.__mapped_iter_); @@ -598,7 +633,7 @@ class flat_multimap { return iterator(std::move(__key_it), std::move(__mapped_it)); } - _LIBCPP_HIDE_FROM_ABI void swap(flat_multimap& __y) noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(flat_multimap& __y) noexcept { // warning: The spec has unconditional noexcept, which means that // if any of the following functions throw an exception, // std::terminate will be called @@ -607,137 +642,160 @@ class flat_multimap { ranges::swap(__containers_.values, __y.__containers_.values); } - _LIBCPP_HIDE_FROM_ABI void clear() noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void clear() noexcept { __containers_.keys.clear(); __containers_.values.clear(); } // observers - _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; } - _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return value_compare(__compare_); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 key_compare key_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare value_comp() const { + return value_compare(__compare_); + } - _LIBCPP_HIDE_FROM_ABI const key_container_type& keys() const noexcept { return __containers_.keys; } - _LIBCPP_HIDE_FROM_ABI 
const mapped_container_type& values() const noexcept { return __containers_.values; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const key_container_type& keys() const noexcept { + return __containers_.keys; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const mapped_container_type& values() const noexcept { + return __containers_.values; + } // map operations - _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const key_type& __x) { + return __find_impl(*this, __x); + } - _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const key_type& __x) const { + return __find_impl(*this, __x); + } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const _Kp& __x) { return __find_impl(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const _Kp& __x) const { return __find_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const key_type& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const _Kp& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } - _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const key_type& __x) const { + return find(__x) != end(); + } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const _Kp& __x) const { return find(__x) != end(); } - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) { return __lower_bound(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const key_type& __x) { + return __lower_bound(*this, __x); + } - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const key_type& __x) const { return __lower_bound(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const _Kp& __x) { return __lower_bound(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const _Kp& __x) const { return __lower_bound(*this, __x); } - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) { return __upper_bound(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const key_type& __x) { + return __upper_bound(*this, __x); + } - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& 
__x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const key_type& __x) const { return __upper_bound(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const _Kp& __x) { return __upper_bound(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const _Kp& __x) const { return __upper_bound(*this, __x); } - _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair equal_range(const key_type& __x) { return __equal_range_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair + equal_range(const key_type& __x) const { return __equal_range_impl(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair equal_range(const _Kp& __x) { return __equal_range_impl(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair + equal_range(const _Kp& __x) const { return __equal_range_impl(*this, __x); } - friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multimap& __x, const flat_multimap& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator==(const flat_multimap& __x, const flat_multimap& __y) { return ranges::equal(__x, __y); } - friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multimap& __x, const flat_multimap& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 auto + operator<=>(const flat_multimap& __x, const flat_multimap& __y) { return std::lexicographical_compare_three_way( __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } - friend _LIBCPP_HIDE_FROM_ABI void swap(flat_multimap& __x, flat_multimap& __y) noexcept { __x.swap(__y); } + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + swap(flat_multimap& __x, flat_multimap& __y) noexcept { + __x.swap(__y); + } private: struct __ctor_uses_allocator_tag { - explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_tag() = default; + explicit _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __ctor_uses_allocator_tag() = default; }; struct __ctor_uses_allocator_empty_tag { - explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_empty_tag() = default; + explicit _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __ctor_uses_allocator_empty_tag() = default; }; template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI - flat_multimap(__ctor_uses_allocator_tag, - const _Allocator& __alloc, - _KeyCont&& __key_cont, - _MappedCont&& __mapped_cont, - _CompArg&&... __comp) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + __ctor_uses_allocator_tag, + const _Allocator& __alloc, + _KeyCont&& __key_cont, + _MappedCont&& __mapped_cont, + _CompArg&&... 
__comp) : __containers_{.keys = std::make_obj_using_allocator( __alloc, std::forward<_KeyCont>(__key_cont)), .values = std::make_obj_using_allocator( @@ -746,29 +804,32 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(__ctor_uses_allocator_empty_tag, const _Allocator& __alloc, _CompArg&&... __comp) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(__ctor_uses_allocator_empty_tag, const _Allocator& __alloc, _CompArg&&... __comp) : __containers_{.keys = std::make_obj_using_allocator(__alloc), .values = std::make_obj_using_allocator(__alloc)}, __compare_(std::forward<_CompArg>(__comp)...) {} - _LIBCPP_HIDE_FROM_ABI bool __is_sorted(auto&& __key_container) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool __is_sorted(auto&& __key_container) const { return ranges::is_sorted(__key_container, __compare_); } - _LIBCPP_HIDE_FROM_ABI void __sort() { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __sort() { auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); ranges::sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); } template - _LIBCPP_HIDE_FROM_ABI static auto __corresponding_mapped_it(_Self&& __self, _KeyIter&& __key_iter) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto + __corresponding_mapped_it(_Self&& __self, _KeyIter&& __key_iter) { return __self.__containers_.values.begin() + static_cast>( ranges::distance(__self.__containers_.keys.begin(), __key_iter)); } template - _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_InputIterator __first, _Sentinel __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + __append_sort_merge(_InputIterator __first, _Sentinel __last) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); size_t __num_appended = __flat_map_utils::__append(*this, std::move(__first), std::move(__last)); if (__num_appended != 0) { @@ -791,7 +852,7 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __find_impl(_Self&& __self, const _Kp& __key) { auto __it = __self.lower_bound(__key); auto __last = __self.end(); if (__it == __last || __self.__compare_(__key, __it->first)) { @@ -801,7 +862,7 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { auto [__key_first, __key_last] = std::equal_range(__self.__containers_.keys.begin(), __self.__containers_.keys.end(), __key, __self.__compare_); @@ -811,7 +872,7 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI static _Res __lower_bound(_Self&& __self, _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static _Res __lower_bound(_Self&& __self, _Kp& __x) { auto __key_iter = std::lower_bound(__self.__containers_.keys.begin(), __self.__containers_.keys.end(), __x, __self.__compare_); auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter); @@ -819,14 +880,14 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI static _Res __upper_bound(_Self&& __self, _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static _Res __upper_bound(_Self&& __self, _Kp& __x) { auto __key_iter = 
std::upper_bound(__self.__containers_.keys.begin(), __self.__containers_.keys.end(), __x, __self.__compare_); auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter); return _Res(std::move(__key_iter), std::move(__mapped_iter)); } - _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __reserve(size_t __size) { if constexpr (__container_traits<_KeyContainer>::__reservable) { __containers_.keys.reserve(__size); } @@ -837,7 +898,8 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI iterator __erase(_KIter __key_iter_to_remove, _MIter __mapped_iter_to_remove) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator + __erase(_KIter __key_iter_to_remove, _MIter __mapped_iter_to_remove) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_iter = __containers_.keys.erase(__key_iter_to_remove); auto __mapped_iter = __containers_.values.erase(__mapped_iter_to_remove); @@ -847,7 +909,8 @@ class flat_multimap { template friend typename flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>::size_type - erase_if(flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate); + _LIBCPP_CONSTEXPR_SINCE_CXX26 + erase_if(flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate); friend __flat_map_utils; @@ -855,8 +918,9 @@ class flat_multimap { _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; struct __key_equiv { - _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} - _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_equiv(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator()(const_reference __x, const_reference __y) const { return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x)); } key_compare __comp_; @@ -980,8 +1044,9 @@ struct uses_allocator && uses_allocator_v<_MappedContainer, _Allocator>> {}; template -_LIBCPP_HIDE_FROM_ABI typename flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::size_type -erase_if(flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>& __flat_multimap, _Predicate __pred) { +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + typename flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::size_type + erase_if(flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>& __flat_multimap, _Predicate __pred) { auto __zv = ranges::views::zip(__flat_multimap.__containers_.keys, __flat_multimap.__containers_.values); auto __first = __zv.begin(); auto __last = __zv.end(); diff --git a/libcxx/include/__format/range_default_formatter.h b/libcxx/include/__format/range_default_formatter.h index 7149debb2f141..2769647ad527e 100644 --- a/libcxx/include/__format/range_default_formatter.h +++ b/libcxx/include/__format/range_default_formatter.h @@ -16,10 +16,10 @@ #include <__algorithm/ranges_copy.h> #include <__chrono/statically_widen.h> -#include <__concepts/same_as.h> #include <__config> #include <__format/concepts.h> #include <__format/formatter.h> +#include <__format/range_format.h> #include <__format/range_formatter.h> #include <__iterator/back_insert_iterator.h> #include <__ranges/concepts.h> @@ -42,51 +42,11 @@ concept __const_formattable_range = template using __fmt_maybe_const _LIBCPP_NODEBUG = 
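The _LIBCPP_CONSTEXPR_SINCE_CXX26 annotations added to flat_multimap above aim to make the container usable in constant expressions in C++26 mode. A hypothetical sketch of the kind of code this is meant to enable (not a test from this patch):

#include <flat_map>

constexpr int count_key_one() {
  std::flat_multimap<int, int> m;
  m.insert({1, 10});
  m.insert({1, 20});
  m.insert({2, 30});
  return static_cast<int>(m.count(1));
}

// Today this runs at run time; once the C++26 constexpr support is complete,
// the same call is expected to be valid in a static_assert.
int main() { return count_key_one() == 2 ? 0 : 1; }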
conditional_t<__const_formattable_range<_Rp, _CharT>, const _Rp, _Rp>; -_LIBCPP_DIAGNOSTIC_PUSH -_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wshadow") -_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wshadow") -// This shadows map, set, and string. -enum class range_format { disabled, map, set, sequence, string, debug_string }; -_LIBCPP_DIAGNOSTIC_POP - // There is no definition of this struct, it's purely intended to be used to // generate diagnostics. template struct __instantiated_the_primary_template_of_format_kind; -template -constexpr range_format format_kind = [] { - // [format.range.fmtkind]/1 - // A program that instantiates the primary template of format_kind is ill-formed. - static_assert(sizeof(_Rp) != sizeof(_Rp), "create a template specialization of format_kind for your type"); - return range_format::disabled; -}(); - -template - requires same_as<_Rp, remove_cvref_t<_Rp>> -inline constexpr range_format format_kind<_Rp> = [] { - // [format.range.fmtkind]/2 - - // 2.1 If same_as>, R> is true, - // Otherwise format_kind is range_format::disabled. - if constexpr (same_as>, _Rp>) - return range_format::disabled; - // 2.2 Otherwise, if the qualified-id R::key_type is valid and denotes a type: - else if constexpr (requires { typename _Rp::key_type; }) { - // 2.2.1 If the qualified-id R::mapped_type is valid and denotes a type ... - if constexpr (requires { typename _Rp::mapped_type; } && - // 2.2.1 ... If either U is a specialization of pair or U is a specialization - // of tuple and tuple_size_v == 2 - __fmt_pair_like>>) - return range_format::map; - else - // 2.2.2 Otherwise format_kind is range_format::set. - return range_format::set; - } else - // 2.3 Otherwise, format_kind is range_format::sequence. - return range_format::sequence; -}(); - template struct __range_default_formatter; diff --git a/libcxx/include/__format/range_format.h b/libcxx/include/__format/range_format.h new file mode 100644 index 0000000000000..139cfd92ee32b --- /dev/null +++ b/libcxx/include/__format/range_format.h @@ -0,0 +1,71 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___FORMAT_RANGE_FORMAT_H +#define _LIBCPP___FORMAT_RANGE_FORMAT_H + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#include <__concepts/same_as.h> +#include <__config> +#include <__format/concepts.h> +#include <__ranges/concepts.h> +#include <__type_traits/remove_cvref.h> + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 23 + +_LIBCPP_DIAGNOSTIC_PUSH +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wshadow") +_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wshadow") +// This shadows map, set, and string. +enum class range_format { disabled, map, set, sequence, string, debug_string }; +_LIBCPP_DIAGNOSTIC_POP + +template +constexpr range_format format_kind = [] { + // [format.range.fmtkind]/1 + // A program that instantiates the primary template of format_kind is ill-formed. 
+ static_assert(sizeof(_Rp) != sizeof(_Rp), "create a template specialization of format_kind for your type"); + return range_format::disabled; +}(); + +template + requires same_as<_Rp, remove_cvref_t<_Rp>> +inline constexpr range_format format_kind<_Rp> = [] { + // [format.range.fmtkind]/2 + + // 2.1 If same_as>, R> is true, + // Otherwise format_kind is range_format::disabled. + if constexpr (same_as>, _Rp>) + return range_format::disabled; + // 2.2 Otherwise, if the qualified-id R::key_type is valid and denotes a type: + else if constexpr (requires { typename _Rp::key_type; }) { + // 2.2.1 If the qualified-id R::mapped_type is valid and denotes a type ... + if constexpr (requires { typename _Rp::mapped_type; } && + // 2.2.1 ... If either U is a specialization of pair or U is a specialization + // of tuple and tuple_size_v == 2 + __fmt_pair_like>>) + return range_format::map; + else + // 2.2.2 Otherwise format_kind is range_format::set. + return range_format::set; + } else + // 2.3 Otherwise, format_kind is range_format::sequence. + return range_format::sequence; +}(); + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_END_NAMESPACE_STD + +#endif diff --git a/libcxx/include/__functional/bind.h b/libcxx/include/__functional/bind.h index 596cce03cdb58..def9e4c4ec7a9 100644 --- a/libcxx/include/__functional/bind.h +++ b/libcxx/include/__functional/bind.h @@ -83,15 +83,14 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& __mu(reference_w template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __invoke_result_t<_Ti&, _Uj...> -__mu_expand(_Ti& __ti, tuple<_Uj...>& __uj, __tuple_indices<_Indx...>) { +__mu_expand(_Ti& __ti, tuple<_Uj...>& __uj, __index_sequence<_Indx...>) { return __ti(std::forward<_Uj>(std::get<_Indx>(__uj))...); } template ::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __invoke_result_t<_Ti&, _Uj...> __mu(_Ti& __ti, tuple<_Uj...>& __uj) { - typedef typename __make_tuple_indices::type __indices; - return std::__mu_expand(__ti, __uj, __indices()); + return std::__mu_expand(__ti, __uj, __make_index_sequence()); } template @@ -191,7 +190,7 @@ struct __bind_return<_Fp, const tuple<_BoundArgs...>, _TupleUj, true> { template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __bind_return<_Fp, _BoundArgs, _Args>::type -__apply_functor(_Fp& __f, _BoundArgs& __bound_args, __tuple_indices<_Indx...>, _Args&& __args) { +__apply_functor(_Fp& __f, _BoundArgs& __bound_args, __index_sequence<_Indx...>, _Args&& __args) { return std::__invoke(__f, std::__mu(std::get<_Indx>(__bound_args), __args)...); } @@ -205,8 +204,6 @@ class __bind : public __weak_result_type<__decay_t<_Fp> > { _Fd __f_; _Td __bound_args_; - typedef typename __make_tuple_indices::type __indices; - public: template < class _Gp, @@ -219,14 +216,22 @@ class __bind : public __weak_result_type<__decay_t<_Fp> > { template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __bind_return<_Fd, _Td, tuple<_Args&&...> >::type operator()(_Args&&... __args) { - return std::__apply_functor(__f_, __bound_args_, __indices(), tuple<_Args&&...>(std::forward<_Args>(__args)...)); + return std::__apply_functor( + __f_, + __bound_args_, + __make_index_sequence(), + tuple<_Args&&...>(std::forward<_Args>(__args)...)); } template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __bind_return >::type operator()(_Args&&... 
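The format_kind variable template moved into __format/range_format.h above is the C++23 customization point that the static_assert message refers to. A sketch of how a user-defined range might opt into set-style formatting, with a hypothetical type name:

#include <format>
#include <string>
#include <vector>

struct IdSet {
  std::vector<int> ids;
  auto begin() const { return ids.begin(); }
  auto end() const { return ids.end(); }
};

// Specializing std::format_kind selects how the range formatter prints the type.
template <>
constexpr std::range_format std::format_kind<IdSet> = std::range_format::set;

std::string demo() { return std::format("{}", IdSet{{1, 2, 3}}); } // "{1, 2, 3}"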
__args) const { - return std::__apply_functor(__f_, __bound_args_, __indices(), tuple<_Args&&...>(std::forward<_Args>(__args)...)); + return std::__apply_functor( + __f_, + __bound_args_, + __make_index_sequence(), + tuple<_Args&&...>(std::forward<_Args>(__args)...)); } }; diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index 78f2f3bfd2f4c..03f50d9f3f269 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -122,6 +122,19 @@ struct __get_hash_node_value_type<__hash_value_type<_Key, _Tp> > { template using __get_hash_node_value_type_t _LIBCPP_NODEBUG = typename __get_hash_node_value_type<_Tp>::type; +template +struct __get_hash_node_key_type { + using type _LIBCPP_NODEBUG = _Tp; +}; + +template +struct __get_hash_node_key_type<__hash_value_type<_Key, _Tp> > { + using type _LIBCPP_NODEBUG = _Key; +}; + +template +using __get_hash_node_key_type_t _LIBCPP_NODEBUG = typename __get_hash_node_key_type<_Tp>::type; + template struct __hash_node : public __hash_node_base< __rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > > { using __node_value_type _LIBCPP_NODEBUG = __get_hash_node_value_type_t<_Tp>; @@ -182,69 +195,11 @@ class __hash_map_iterator; template class __hash_map_const_iterator; -template -struct __hash_key_value_types { - static_assert(!is_reference<_Tp>::value && !is_const<_Tp>::value, ""); - typedef _Tp key_type; - typedef _Tp __node_value_type; - typedef _Tp __container_value_type; - static const bool __is_map = false; - - _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(_Tp const& __v) { return __v; } - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(__node_value_type const& __v) { return __v; } - _LIBCPP_HIDE_FROM_ABI static __container_value_type* __get_ptr(__node_value_type& __n) { return std::addressof(__n); } - _LIBCPP_HIDE_FROM_ABI static __container_value_type&& __move(__node_value_type& __v) { return std::move(__v); } -}; - -template -struct __hash_key_value_types<__hash_value_type<_Key, _Tp> > { - typedef _Key key_type; - typedef _Tp mapped_type; - typedef __hash_value_type<_Key, _Tp> __node_value_type; - typedef pair __container_value_type; - typedef __container_value_type __map_value_type; - static const bool __is_map = true; - - _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(__container_value_type const& __v) { return __v.first; } - - template , __node_value_type>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(_Up& __t) { - return __t.__get_value(); - } - - template , __container_value_type>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(_Up& __t) { - return __t; - } - - _LIBCPP_HIDE_FROM_ABI static __container_value_type* __get_ptr(__container_value_type& __n) { - return std::addressof(__n); - } - _LIBCPP_HIDE_FROM_ABI static pair __move(__node_value_type& __v) { return __v.__move(); } -}; - -template , bool = _KVTypes::__is_map> -struct __hash_map_pointer_types {}; - -template -struct __hash_map_pointer_types<_Tp, _AllocPtr, _KVTypes, true> { - typedef typename _KVTypes::__map_value_type _Mv; - typedef __rebind_pointer_t<_AllocPtr, _Mv> __map_value_type_pointer; - typedef __rebind_pointer_t<_AllocPtr, const _Mv> __const_map_value_type_pointer; -}; - template ::element_type> struct __hash_node_types; template -struct __hash_node_types<_NodePtr, __hash_node<_Tp, _VoidPtr> > - : public __hash_key_value_types<_Tp>, - __hash_map_pointer_types<_Tp, _VoidPtr> - -{ - typedef 
__hash_key_value_types<_Tp> __base; - -public: +struct __hash_node_types<_NodePtr, __hash_node<_Tp, _VoidPtr> > { typedef ptrdiff_t difference_type; typedef size_t size_type; @@ -617,8 +572,6 @@ public: typedef typename __alloc_traits::pointer pointer; private: - typedef __hash_node_types _NodeTypes; - allocator_type& __na_; public: @@ -633,7 +586,7 @@ public: _LIBCPP_HIDE_FROM_ABI void operator()(pointer __p) _NOEXCEPT { if (__value_constructed) { - __alloc_traits::destroy(__na_, _NodeTypes::__get_ptr(__p->__get_value())); + __alloc_traits::destroy(__na_, std::addressof(__p->__get_value())); std::__destroy_at(std::addressof(*__p)); } if (__p) @@ -684,6 +637,8 @@ template class __hash_table { public: using value_type = __get_hash_node_value_type_t<_Tp>; + using key_type = __get_hash_node_key_type_t<_Tp>; + typedef _Hash hasher; typedef _Equal key_equal; typedef _Alloc allocator_type; @@ -694,8 +649,6 @@ private: public: typedef typename _NodeTypes::__node_value_type __node_value_type; - typedef typename _NodeTypes::__container_value_type __container_value_type; - typedef typename _NodeTypes::key_type key_type; typedef value_type& reference; typedef const value_type& const_reference; typedef typename __alloc_traits::pointer pointer; @@ -824,7 +777,7 @@ public: template ::value, int> = 0> + __enable_if_t<__can_extract_map_key<_First, key_type, value_type>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI pair __emplace_unique(_First&& __f, _Second&& __s) { return __emplace_unique_key_args(__f, std::forward<_First>(__f), std::forward<_Second>(__s)); } @@ -854,9 +807,7 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __insert_unique_from_orphaned_node(value_type&& __value) { - using __key_type = typename _NodeTypes::key_type; - - __node_holder __h = __construct_node(const_cast<__key_type&&>(__value.first), std::move(__value.second)); + __node_holder __h = __construct_node(const_cast(__value.first), std::move(__value.second)); __node_insert_unique(__h.get()); __h.release(); } @@ -870,9 +821,7 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(value_type&& __value) { - using __key_type = typename _NodeTypes::key_type; - - __node_holder __h = __construct_node(const_cast<__key_type&&>(__value.first), std::move(__value.second)); + __node_holder __h = __construct_node(const_cast(__value.first), std::move(__value.second)); __node_insert_multi(__h.get()); __h.release(); } @@ -1047,12 +996,10 @@ private: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __assign_value(__get_hash_node_value_type_t<_Tp>& __lhs, _From&& __rhs) { - using __key_type = typename _NodeTypes::key_type; - // This is technically UB, since the object was constructed as `const`. // Clang doesn't optimize on this currently though. 
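In isolation, the assignment pattern this helper relies on looks roughly like the stand-alone sketch below; the type and names are illustrative, not the actual libc++ internals:

#include <string>
#include <utility>

using Slot = std::pair<const int, std::string>;

void reuse_slot(Slot& slot) {
  // Overwrite the const key so the node's storage can be reused for a new
  // element; as the comment above notes, this is technically UB because the
  // key subobject was created const, but it is the trick the helper depends on.
  const_cast<int&>(slot.first) = 42;
  slot.second = "replaced";
}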
- const_cast<__key_type&>(__lhs.first) = const_cast<__copy_cvref_t<_From, __key_type>&&>(__rhs.first); - __lhs.second = std::forward<_From>(__rhs).second; + const_cast(__lhs.first) = const_cast<__copy_cvref_t<_From, key_type>&&>(__rhs.first); + __lhs.second = std::forward<_From>(__rhs).second; } template ::value, int> = 0> @@ -1201,7 +1148,7 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__deallocate_node(__next_pointer while (__np != nullptr) { __next_pointer __next = __np->__next_; __node_pointer __real_np = __np->__upcast(); - __node_traits::destroy(__na, _NodeTypes::__get_ptr(__real_np->__get_value())); + __node_traits::destroy(__na, std::addressof(__real_np->__get_value())); std::__destroy_at(std::addressof(*__real_np)); __node_traits::deallocate(__na, __real_np, 1); __np = __next; @@ -1290,8 +1237,8 @@ template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_unique(_InputIterator __first, _InputIterator __last) { typedef iterator_traits<_InputIterator> _ITraits; typedef typename _ITraits::value_type _ItValueType; - static_assert(is_same<_ItValueType, __container_value_type>::value, - "__assign_unique may only be called with the containers value type"); + static_assert( + is_same<_ItValueType, value_type>::value, "__assign_unique may only be called with the containers value type"); if (bucket_count() != 0) { __next_pointer __cache = __detach(); @@ -1321,10 +1268,8 @@ template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_multi(_InputIterator __first, _InputIterator __last) { typedef iterator_traits<_InputIterator> _ITraits; typedef typename _ITraits::value_type _ItValueType; - static_assert( - (is_same<_ItValueType, __container_value_type>::value || is_same<_ItValueType, __node_value_type>::value), - "__assign_multi may only be called with the containers value type" - " or the nodes value type"); + static_assert(is_same<_ItValueType, value_type>::value, + "__assign_multi may only be called with the containers value type or the nodes value type"); if (bucket_count() != 0) { __next_pointer __cache = __detach(); #if _LIBCPP_HAS_EXCEPTIONS @@ -1345,7 +1290,7 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_multi(_InputIterator __f __deallocate_node(__cache); } for (; __first != __last; ++__first) - __emplace_multi(_NodeTypes::__get_value(*__first)); + __emplace_multi(*__first); } template @@ -1863,7 +1808,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node(_Args&&... __args) { std::__construct_at(std::addressof(*__h), /* next = */ nullptr, /* hash = */ 0); // Now construct the value_type using the allocator's construct() method. 
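As a stand-alone illustration of the construct()/destroy() pairing used for the node's value slot, a minimal sketch follows; the allocator, buffer, and names are hypothetical, while the real code constructs directly into the hash node's storage:

#include <memory>
#include <string>
#include <utility>

int main() {
  using Pair  = std::pair<const int, std::string>;
  using Alloc = std::allocator<Pair>;
  Alloc alloc;
  alignas(Pair) unsigned char buf[sizeof(Pair)];
  auto* slot = reinterpret_cast<Pair*>(buf);
  // Placement-construct the map value through the allocator, then destroy it
  // the same way, mirroring the pairing used by __hash_table.
  std::allocator_traits<Alloc>::construct(alloc, slot, 1, "one");
  std::allocator_traits<Alloc>::destroy(alloc, slot);
}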
- __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__get_value()), std::forward<_Args>(__args)...); + __node_traits::construct(__na, std::addressof(__h->__get_value()), std::forward<_Args>(__args)...); __h.get_deleter().__value_constructed = true; __h->__hash_ = hash_function()(__h->__get_value()); @@ -1879,7 +1824,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node_hash(size_t __hash, _ __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na)); std::__construct_at(std::addressof(*__h), /* next = */ nullptr, /* hash = */ __hash); __node_traits::construct( - __na, _NodeTypes::__get_ptr(__h->__get_value()), std::forward<_First>(__f), std::forward<_Rest>(__rest)...); + __na, std::addressof(__h->__get_value()), std::forward<_First>(__f), std::forward<_Rest>(__rest)...); __h.get_deleter().__value_constructed = true; return __h; } diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h index 8dbc28e839839..9f3ce02a3af20 100644 --- a/libcxx/include/__locale_dir/locale_base_api.h +++ b/libcxx/include/__locale_dir/locale_base_api.h @@ -129,8 +129,6 @@ // will define those directly. # if defined(_AIX) || defined(__MVS__) # include <__locale_dir/locale_base_api/ibm.h> -# elif defined(__ANDROID__) -# include <__locale_dir/locale_base_api/android.h> # elif defined(__OpenBSD__) # include <__locale_dir/locale_base_api/openbsd.h> # elif defined(__wasi__) || _LIBCPP_HAS_MUSL_LIBC diff --git a/libcxx/include/__locale_dir/locale_base_api/android.h b/libcxx/include/__locale_dir/locale_base_api/android.h deleted file mode 100644 index 36b8d93e1b228..0000000000000 --- a/libcxx/include/__locale_dir/locale_base_api/android.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- C++ -*- -//===-----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H -#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H - -#include - -// FIXME: Is this actually required? -extern "C" { -#include -} - -#include - -// If we do not have this header, we are in a platform build rather than an NDK -// build, which will always be at least as new as the ToT NDK, in which case we -// don't need any of the inlines below since libc provides them. 
-#if __has_include() -# include -// In NDK versions later than 16, locale-aware functions are provided by -// legacy_stdlib_inlines.h -# if __NDK_MAJOR__ <= 16 -# if __ANDROID_API__ < 26 - -inline _LIBCPP_HIDE_FROM_ABI float strtof_l(const char* __nptr, char** __endptr, locale_t) { - return ::strtof(__nptr, __endptr); -} - -inline _LIBCPP_HIDE_FROM_ABI double strtod_l(const char* __nptr, char** __endptr, locale_t) { - return ::strtod(__nptr, __endptr); -} - -# endif // __ANDROID_API__ < 26 - -# endif // __NDK_MAJOR__ <= 16 -#endif // __has_include() - -#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H diff --git a/libcxx/include/__memory/construct_at.h b/libcxx/include/__memory/construct_at.h index b64e64b5a29b0..658269158d945 100644 --- a/libcxx/include/__memory/construct_at.h +++ b/libcxx/include/__memory/construct_at.h @@ -33,7 +33,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 template ()) _Tp(std::declval<_Args>()...))> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp* construct_at(_Tp* __location, _Args&&... __args) { +_LIBCPP_HIDE_FROM_ABI constexpr _Tp* construct_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __location, _Args&&... __args) { _LIBCPP_ASSERT_NON_NULL(__location != nullptr, "null pointer given to construct_at"); return ::new (static_cast(__location)) _Tp(std::forward<_Args>(__args)...); } @@ -73,13 +73,13 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __destroy_at(_Tp* __loc) { #if _LIBCPP_STD_VER >= 17 template , int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void destroy_at(_Tp* __loc) { +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { std::__destroy_at(__loc); } # if _LIBCPP_STD_VER >= 20 template , int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr void destroy_at(_Tp* __loc) { +_LIBCPP_HIDE_FROM_ABI constexpr void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { std::__destroy_at(__loc); } # endif diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h index 1b8711f10811d..6e7a9afc25deb 100644 --- a/libcxx/include/__memory_resource/polymorphic_allocator.h +++ b/libcxx/include/__memory_resource/polymorphic_allocator.h @@ -135,10 +135,10 @@ class _LIBCPP_AVAILABILITY_PMR polymorphic_allocator { piecewise_construct, __transform_tuple(typename __uses_alloc_ctor< _T1, polymorphic_allocator&, _Args1... >::type(), std::move(__x), - typename __make_tuple_indices::type{}), + make_index_sequence()), __transform_tuple(typename __uses_alloc_ctor< _T2, polymorphic_allocator&, _Args2... 
>::type(), std::move(__y), - typename __make_tuple_indices::type{})); + make_index_sequence())); } template @@ -194,20 +194,20 @@ class _LIBCPP_AVAILABILITY_PMR polymorphic_allocator { private: template _LIBCPP_HIDE_FROM_ABI tuple<_Args&&...> - __transform_tuple(integral_constant, tuple<_Args...>&& __t, __tuple_indices<_Is...>) { + __transform_tuple(integral_constant, tuple<_Args...>&& __t, index_sequence<_Is...>) { return std::forward_as_tuple(std::get<_Is>(std::move(__t))...); } template _LIBCPP_HIDE_FROM_ABI tuple - __transform_tuple(integral_constant, tuple<_Args...>&& __t, __tuple_indices<_Is...>) { + __transform_tuple(integral_constant, tuple<_Args...>&& __t, index_sequence<_Is...>) { using _Tup = tuple; return _Tup(allocator_arg, *this, std::get<_Is>(std::move(__t))...); } template _LIBCPP_HIDE_FROM_ABI tuple<_Args&&..., polymorphic_allocator&> - __transform_tuple(integral_constant, tuple<_Args...>&& __t, __tuple_indices<_Is...>) { + __transform_tuple(integral_constant, tuple<_Args...>&& __t, index_sequence<_Is...>) { using _Tup = tuple<_Args&&..., polymorphic_allocator&>; return _Tup(std::get<_Is>(std::move(__t))..., *this); } diff --git a/libcxx/include/__mutex/once_flag.h b/libcxx/include/__mutex/once_flag.h index 33064499550eb..e384c15a9f9b6 100644 --- a/libcxx/include/__mutex/once_flag.h +++ b/libcxx/include/__mutex/once_flag.h @@ -13,9 +13,9 @@ #include <__functional/invoke.h> #include <__memory/addressof.h> #include <__memory/shared_count.h> // __libcpp_acquire_load -#include <__tuple/tuple_indices.h> #include <__tuple/tuple_size.h> #include <__utility/forward.h> +#include <__utility/integer_sequence.h> #include <__utility/move.h> #include #ifndef _LIBCPP_CXX03_LANG @@ -87,15 +87,12 @@ class __call_once_param { public: _LIBCPP_HIDE_FROM_ABI explicit __call_once_param(_Fp& __f) : __f_(__f) {} - _LIBCPP_HIDE_FROM_ABI void operator()() { - typedef typename __make_tuple_indices::value, 1>::type _Index; - __execute(_Index()); - } + _LIBCPP_HIDE_FROM_ABI void operator()() { __execute(__make_index_sequence::value>()); } private: template - _LIBCPP_HIDE_FROM_ABI void __execute(__tuple_indices<_Indices...>) { - std::__invoke(std::get<0>(std::move(__f_)), std::get<_Indices>(std::move(__f_))...); + _LIBCPP_HIDE_FROM_ABI void __execute(__index_sequence<_Indices...>) { + std::__invoke(std::get<_Indices>(std::move(__f_))...); } }; diff --git a/libcxx/include/__ranges/zip_transform_view.h b/libcxx/include/__ranges/zip_transform_view.h new file mode 100644 index 0000000000000..07aa182f2858f --- /dev/null +++ b/libcxx/include/__ranges/zip_transform_view.h @@ -0,0 +1,357 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
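The migration from __tuple_indices to __index_sequence in this patch (bind, pmr::polymorphic_allocator, once_flag, and friends) is the standard pack-expansion idiom; a minimal sketch using the public std::index_sequence facilities, with hypothetical names:

#include <cstddef>
#include <tuple>
#include <type_traits>
#include <utility>

template <class Fn, class Tuple, std::size_t... Is>
decltype(auto) apply_impl(Fn&& f, Tuple&& t, std::index_sequence<Is...>) {
  return std::forward<Fn>(f)(std::get<Is>(std::forward<Tuple>(t))...);
}

template <class Fn, class Tuple>
decltype(auto) apply_like(Fn&& f, Tuple&& t) {
  // make_index_sequence expands to index_sequence<0, 1, ..., N-1>, which drives
  // the std::get<Is>(t)... pack expansion above.
  return apply_impl(std::forward<Fn>(f), std::forward<Tuple>(t),
                    std::make_index_sequence<std::tuple_size_v<std::remove_reference_t<Tuple>>>());
}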
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___RANGES_ZIP_TRANSFORM_VIEW_H +#define _LIBCPP___RANGES_ZIP_TRANSFORM_VIEW_H + +#include <__config> + +#include <__concepts/constructible.h> +#include <__concepts/convertible_to.h> +#include <__concepts/derived_from.h> +#include <__concepts/equality_comparable.h> +#include <__concepts/invocable.h> +#include <__functional/invoke.h> +#include <__iterator/concepts.h> +#include <__iterator/incrementable_traits.h> +#include <__iterator/iterator_traits.h> +#include <__memory/addressof.h> +#include <__ranges/access.h> +#include <__ranges/all.h> +#include <__ranges/concepts.h> +#include <__ranges/empty_view.h> +#include <__ranges/movable_box.h> +#include <__ranges/view_interface.h> +#include <__ranges/zip_view.h> +#include <__type_traits/decay.h> +#include <__type_traits/invoke.h> +#include <__type_traits/is_object.h> +#include <__type_traits/is_reference.h> +#include <__type_traits/is_referenceable.h> +#include <__type_traits/maybe_const.h> +#include <__type_traits/remove_cvref.h> +#include <__utility/forward.h> +#include <__utility/in_place.h> +#include <__utility/move.h> +#include // for std::apply + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 23 + +namespace ranges { + +template + requires(view<_Views> && ...) && + (sizeof...(_Views) > 0) && is_object_v<_Fn> && regular_invocable<_Fn&, range_reference_t<_Views>...> && + __referenceable...>> +class zip_transform_view : public view_interface> { + _LIBCPP_NO_UNIQUE_ADDRESS zip_view<_Views...> __zip_; + _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Fn> __fun_; + + using _InnerView _LIBCPP_NODEBUG = zip_view<_Views...>; + template + using __ziperator _LIBCPP_NODEBUG = iterator_t<__maybe_const<_Const, _InnerView>>; + template + using __zentinel _LIBCPP_NODEBUG = sentinel_t<__maybe_const<_Const, _InnerView>>; + + template + class __iterator; + + template + class __sentinel; + +public: + _LIBCPP_HIDE_FROM_ABI zip_transform_view() = default; + + _LIBCPP_HIDE_FROM_ABI constexpr explicit zip_transform_view(_Fn __fun, _Views... __views) + : __zip_(std::move(__views)...), __fun_(in_place, std::move(__fun)) {} + + _LIBCPP_HIDE_FROM_ABI constexpr auto begin() { return __iterator(*this, __zip_.begin()); } + + _LIBCPP_HIDE_FROM_ABI constexpr auto begin() const + requires range && regular_invocable...> + { + return __iterator(*this, __zip_.begin()); + } + + _LIBCPP_HIDE_FROM_ABI constexpr auto end() { + if constexpr (common_range<_InnerView>) { + return __iterator(*this, __zip_.end()); + } else { + return __sentinel(__zip_.end()); + } + } + + _LIBCPP_HIDE_FROM_ABI constexpr auto end() const + requires range && regular_invocable...> + { + if constexpr (common_range) { + return __iterator(*this, __zip_.end()); + } else { + return __sentinel(__zip_.end()); + } + } + + _LIBCPP_HIDE_FROM_ABI constexpr auto size() + requires sized_range<_InnerView> + { + return __zip_.size(); + } + + _LIBCPP_HIDE_FROM_ABI constexpr auto size() const + requires sized_range + { + return __zip_.size(); + } +}; + +template +zip_transform_view(_Fn, _Ranges&&...) 
-> zip_transform_view<_Fn, views::all_t<_Ranges>...>; + +template +struct __zip_transform_iterator_category_base {}; + +template + requires forward_range<__maybe_const<_Const, zip_view<_Views...>>> +struct __zip_transform_iterator_category_base<_Const, _Fn, _Views...> { +private: + template + using __tag _LIBCPP_NODEBUG = typename iterator_traits>>::iterator_category; + + static consteval auto __get_iterator_category() { + if constexpr (!is_reference_v&, + range_reference_t<__maybe_const<_Const, _Views>>...>>) { + return input_iterator_tag(); + } else if constexpr ((derived_from<__tag<_Views>, random_access_iterator_tag> && ...)) { + return random_access_iterator_tag(); + } else if constexpr ((derived_from<__tag<_Views>, bidirectional_iterator_tag> && ...)) { + return bidirectional_iterator_tag(); + } else if constexpr ((derived_from<__tag<_Views>, forward_iterator_tag> && ...)) { + return forward_iterator_tag(); + } else { + return input_iterator_tag(); + } + } + +public: + using iterator_category = decltype(__get_iterator_category()); +}; + +template + requires(view<_Views> && ...) && + (sizeof...(_Views) > 0) && is_object_v<_Fn> && regular_invocable<_Fn&, range_reference_t<_Views>...> && + __referenceable...>> +template +class zip_transform_view<_Fn, _Views...>::__iterator + : public __zip_transform_iterator_category_base<_Const, _Fn, _Views...> { + using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, zip_transform_view>; + using _Base _LIBCPP_NODEBUG = __maybe_const<_Const, _InnerView>; + + friend zip_transform_view<_Fn, _Views...>; + + _Parent* __parent_ = nullptr; + __ziperator<_Const> __inner_; + + _LIBCPP_HIDE_FROM_ABI constexpr __iterator(_Parent& __parent, __ziperator<_Const> __inner) + : __parent_(std::addressof(__parent)), __inner_(std::move(__inner)) {} + + _LIBCPP_HIDE_FROM_ABI constexpr auto __get_deref_and_invoke() const noexcept { + return [&__fun = *__parent_->__fun_](const auto&... 
__iters) noexcept(noexcept(std::invoke( + *__parent_->__fun_, *__iters...))) -> decltype(auto) { return std::invoke(__fun, *__iters...); }; + } + +public: + using iterator_concept = typename __ziperator<_Const>::iterator_concept; + using value_type = + remove_cvref_t&, range_reference_t<__maybe_const<_Const, _Views>>...>>; + using difference_type = range_difference_t<_Base>; + + _LIBCPP_HIDE_FROM_ABI __iterator() = default; + _LIBCPP_HIDE_FROM_ABI constexpr __iterator(__iterator __i) + requires _Const && convertible_to<__ziperator, __ziperator<_Const>> + : __parent_(__i.__parent_), __inner_(std::move(__i.__inner_)) {} + + _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() const + noexcept(noexcept(std::apply(__get_deref_and_invoke(), __zip_view_iterator_access::__get_underlying(__inner_)))) { + return std::apply(__get_deref_and_invoke(), __zip_view_iterator_access::__get_underlying(__inner_)); + } + + _LIBCPP_HIDE_FROM_ABI constexpr __iterator& operator++() { + ++__inner_; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI constexpr void operator++(int) { ++*this; } + + _LIBCPP_HIDE_FROM_ABI constexpr __iterator operator++(int) + requires forward_range<_Base> + { + auto __tmp = *this; + ++*this; + return __tmp; + } + + _LIBCPP_HIDE_FROM_ABI constexpr __iterator& operator--() + requires bidirectional_range<_Base> + { + --__inner_; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI constexpr __iterator operator--(int) + requires bidirectional_range<_Base> + { + auto __tmp = *this; + --*this; + return __tmp; + } + + _LIBCPP_HIDE_FROM_ABI constexpr __iterator& operator+=(difference_type __x) + requires random_access_range<_Base> + { + __inner_ += __x; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI constexpr __iterator& operator-=(difference_type __x) + requires random_access_range<_Base> + { + __inner_ -= __x; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator[](difference_type __n) const + requires random_access_range<_Base> + { + return std::apply( + [&](const _Is&... __iters) -> decltype(auto) { + return std::invoke(*__parent_->__fun_, __iters[iter_difference_t<_Is>(__n)]...); + }, + __zip_view_iterator_access::__get_underlying(__inner_)); + } + + _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const __iterator& __x, const __iterator& __y) + requires equality_comparable<__ziperator<_Const>> + { + return __x.__inner_ == __y.__inner_; + } + + _LIBCPP_HIDE_FROM_ABI friend constexpr auto operator<=>(const __iterator& __x, const __iterator& __y) + requires random_access_range<_Base> + { + return __x.__inner_ <=> __y.__inner_; + } + + _LIBCPP_HIDE_FROM_ABI friend constexpr __iterator operator+(const __iterator& __i, difference_type __n) + requires random_access_range<_Base> + { + return __iterator(*__i.__parent_, __i.__inner_ + __n); + } + + _LIBCPP_HIDE_FROM_ABI friend constexpr __iterator operator+(difference_type __n, const __iterator& __i) + requires random_access_range<_Base> + { + return __iterator(*__i.__parent_, __i.__inner_ + __n); + } + + _LIBCPP_HIDE_FROM_ABI friend constexpr __iterator operator-(const __iterator& __i, difference_type __n) + requires random_access_range<_Base> + { + return __iterator(*__i.__parent_, __i.__inner_ - __n); + } + + _LIBCPP_HIDE_FROM_ABI friend constexpr difference_type operator-(const __iterator& __x, const __iterator& __y) + requires sized_sentinel_for<__ziperator<_Const>, __ziperator<_Const>> + { + return __x.__inner_ - __y.__inner_; + } +}; + +template + requires(view<_Views> && ...) 
&& + (sizeof...(_Views) > 0) && is_object_v<_Fn> && regular_invocable<_Fn&, range_reference_t<_Views>...> && + __referenceable...>> +template +class zip_transform_view<_Fn, _Views...>::__sentinel { + __zentinel<_Const> __inner_; + + friend zip_transform_view<_Fn, _Views...>; + + _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel(__zentinel<_Const> __inner) : __inner_(__inner) {} + +public: + _LIBCPP_HIDE_FROM_ABI __sentinel() = default; + + _LIBCPP_HIDE_FROM_ABI constexpr __sentinel(__sentinel __i) + requires _Const && convertible_to<__zentinel, __zentinel<_Const>> + : __inner_(__i.__inner_) {} + + template + requires sentinel_for<__zentinel<_Const>, __ziperator<_OtherConst>> + _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const __iterator<_OtherConst>& __x, const __sentinel& __y) { + return __x.__inner_ == __y.__inner_; + } + + template + requires sized_sentinel_for<__zentinel<_Const>, __ziperator<_OtherConst>> + _LIBCPP_HIDE_FROM_ABI friend constexpr range_difference_t<__maybe_const<_OtherConst, _InnerView>> + operator-(const __iterator<_OtherConst>& __x, const __sentinel& __y) { + return __x.__inner_ - __y.__inner_; + } + + template + requires sized_sentinel_for<__zentinel<_Const>, __ziperator<_OtherConst>> + _LIBCPP_HIDE_FROM_ABI friend constexpr range_difference_t<__maybe_const<_OtherConst, _InnerView>> + operator-(const __sentinel& __x, const __iterator<_OtherConst>& __y) { + return __x.__inner_ - __y.__inner_; + } +}; + +namespace views { +namespace __zip_transform { + +struct __fn { + template + requires(move_constructible> && regular_invocable&> && + is_object_v&>>) + _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Fn&&) const + noexcept(noexcept(auto(views::empty&>>>))) { + return views::empty&>>>; + } + + template + requires(sizeof...(_Ranges) > 0) + _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Fn&& __fun, _Ranges&&... __rs) const + noexcept(noexcept(zip_transform_view(std::forward<_Fn>(__fun), std::forward<_Ranges>(__rs)...))) + -> decltype(zip_transform_view(std::forward<_Fn>(__fun), std::forward<_Ranges>(__rs)...)) { + return zip_transform_view(std::forward<_Fn>(__fun), std::forward<_Ranges>(__rs)...); + } +}; + +} // namespace __zip_transform +inline namespace __cpo { +inline constexpr auto zip_transform = __zip_transform::__fn{}; +} // namespace __cpo +} // namespace views +} // namespace ranges + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_END_NAMESPACE_STD + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___RANGES_ZIP_TRANSFORM_VIEW_H diff --git a/libcxx/include/__ranges/zip_view.h b/libcxx/include/__ranges/zip_view.h index e2a194efcfb4c..ce00c98710c4e 100644 --- a/libcxx/include/__ranges/zip_view.h +++ b/libcxx/include/__ranges/zip_view.h @@ -235,6 +235,13 @@ struct __zip_view_iterator_category_base<_Const, _Views...> { using iterator_category = input_iterator_tag; }; +struct __zip_view_iterator_access { + template + _LIBCPP_HIDE_FROM_ABI static constexpr decltype(auto) __get_underlying(_Iter& __iter) noexcept { + return (__iter.__current_); + } +}; + template requires(view<_Views> && ...) 
&& (sizeof...(_Views) > 0) template @@ -255,6 +262,7 @@ class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base static constexpr bool __is_zip_view_iterator = true; friend struct __product_iterator_traits<__iterator>; + friend __zip_view_iterator_access; public: using iterator_concept = decltype(ranges::__get_zip_view_iterator_tag<_Const, _Views...>()); diff --git a/libcxx/include/__thread/thread.h b/libcxx/include/__thread/thread.h index 1b51571ce302e..a3b672bc0f0e7 100644 --- a/libcxx/include/__thread/thread.h +++ b/libcxx/include/__thread/thread.h @@ -155,8 +155,8 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, __thread_id __id) { # ifndef _LIBCPP_CXX03_LANG template -inline _LIBCPP_HIDE_FROM_ABI void __thread_execute(tuple<_TSp, _Fp, _Args...>& __t, __tuple_indices<_Indices...>) { - std::__invoke(std::move(std::get<1>(__t)), std::move(std::get<_Indices>(__t))...); +inline _LIBCPP_HIDE_FROM_ABI void __thread_execute(tuple<_TSp, _Fp, _Args...>& __t, __index_sequence<_Indices...>) { + std::__invoke(std::move(std::get<_Indices + 1>(__t))...); } template @@ -164,8 +164,7 @@ _LIBCPP_HIDE_FROM_ABI void* __thread_proxy(void* __vp) { // _Fp = tuple< unique_ptr<__thread_struct>, Functor, Args...> unique_ptr<_Fp> __p(static_cast<_Fp*>(__vp)); __thread_local_data().set_pointer(std::get<0>(*__p.get()).release()); - typedef typename __make_tuple_indices::value, 2>::type _Index; - std::__thread_execute(*__p.get(), _Index()); + std::__thread_execute(*__p.get(), __make_index_sequence::value - 1>()); return nullptr; } diff --git a/libcxx/include/__tree b/libcxx/include/__tree index f29b691b73dda..f8bb4f01b1e29 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -855,17 +855,11 @@ public: private: _LIBCPP_HIDE_FROM_ABI const __node_allocator& __node_alloc() const _NOEXCEPT { return __node_alloc_; } - _LIBCPP_HIDE_FROM_ABI __end_node_pointer& __begin_node() _NOEXCEPT { return __begin_node_; } - _LIBCPP_HIDE_FROM_ABI const __end_node_pointer& __begin_node() const _NOEXCEPT { return __begin_node_; } public: _LIBCPP_HIDE_FROM_ABI allocator_type __alloc() const _NOEXCEPT { return allocator_type(__node_alloc()); } -private: - _LIBCPP_HIDE_FROM_ABI size_type& size() _NOEXCEPT { return __size_; } - -public: - _LIBCPP_HIDE_FROM_ABI const size_type& size() const _NOEXCEPT { return __size_; } + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __size_; } _LIBCPP_HIDE_FROM_ABI value_compare& value_comp() _NOEXCEPT { return __value_comp_; } _LIBCPP_HIDE_FROM_ABI const value_compare& value_comp() const _NOEXCEPT { return __value_comp_; } @@ -902,8 +896,8 @@ public: _LIBCPP_HIDE_FROM_ABI ~__tree(); - _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__begin_node()); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__begin_node()); } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__begin_node_); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__begin_node_); } _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(__end_node()); } _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(__end_node()); } @@ -1225,30 +1219,30 @@ template __tree<_Tp, _Compare, _Allocator>::__tree(const value_compare& __comp) _NOEXCEPT_( is_nothrow_default_constructible<__node_allocator>::value&& is_nothrow_copy_constructible::value) : __size_(0), __value_comp_(__comp) { - __begin_node() = __end_node(); + __begin_node_ = 
__end_node(); } template __tree<_Tp, _Compare, _Allocator>::__tree(const allocator_type& __a) : __begin_node_(), __node_alloc_(__node_allocator(__a)), __size_(0) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } template __tree<_Tp, _Compare, _Allocator>::__tree(const value_compare& __comp, const allocator_type& __a) : __begin_node_(), __node_alloc_(__node_allocator(__a)), __size_(0), __value_comp_(__comp) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } -// Precondition: size() != 0 +// Precondition: __size_ != 0 template typename __tree<_Tp, _Compare, _Allocator>::__node_pointer __tree<_Tp, _Compare, _Allocator>::_DetachedTreeCache::__detach_from_tree(__tree* __t) _NOEXCEPT { - __node_pointer __cache = static_cast<__node_pointer>(__t->__begin_node()); - __t->__begin_node() = __t->__end_node(); + __node_pointer __cache = static_cast<__node_pointer>(__t->__begin_node_); + __t->__begin_node_ = __t->__end_node(); __t->__end_node()->__left_->__parent_ = nullptr; __t->__end_node()->__left_ = nullptr; - __t->size() = 0; + __t->__size_ = 0; // __cache->__left_ == nullptr if (__cache->__right_ != nullptr) __cache = static_cast<__node_pointer>(__cache->__right_); @@ -1300,7 +1294,7 @@ void __tree<_Tp, _Compare, _Allocator>::__assign_unique(_ForwardIterator __first is_same<_ItValueType, value_type>::value, "__assign_unique may only be called with the containers value type"); static_assert( __has_forward_iterator_category<_ForwardIterator>::value, "__assign_unique requires a forward iterator"); - if (size() != 0) { + if (__size_ != 0) { _DetachedTreeCache __cache(this); for (; __cache.__get() != nullptr && __first != __last; ++__first) { if (__node_assign_unique(*__first, __cache.__get()).second) @@ -1318,7 +1312,7 @@ void __tree<_Tp, _Compare, _Allocator>::__assign_multi(_InputIterator __first, _ typedef typename _ITraits::value_type _ItValueType; static_assert( is_same<_ItValueType, value_type>::value, "__assign_multi may only be called with the containers value_type"); - if (size() != 0) { + if (__size_ != 0) { _DetachedTreeCache __cache(this); for (; __cache.__get() && __first != __last; ++__first) { __assign_value(__cache.__get()->__value_, *__first); @@ -1337,7 +1331,7 @@ __tree<_Tp, _Compare, _Allocator>::__tree(const __tree& __t) __node_alloc_(__node_traits::select_on_container_copy_construction(__t.__node_alloc())), __size_(0), __value_comp_(__t.value_comp()) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } template @@ -1348,13 +1342,13 @@ __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t) _NOEXCEPT_( __node_alloc_(std::move(__t.__node_alloc_)), __size_(__t.__size_), __value_comp_(std::move(__t.__value_comp_)) { - if (size() == 0) - __begin_node() = __end_node(); + if (__size_ == 0) + __begin_node_ = __end_node(); else { __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - __t.__begin_node() = __t.__end_node(); + __t.__begin_node_ = __t.__end_node(); __t.__end_node()->__left_ = nullptr; - __t.size() = 0; + __t.__size_ = 0; } } @@ -1362,19 +1356,19 @@ template __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t, const allocator_type& __a) : __node_alloc_(__node_allocator(__a)), __size_(0), __value_comp_(std::move(__t.value_comp())) { if (__a == __t.__alloc()) { - if (__t.size() == 0) - __begin_node() = __end_node(); + if (__t.__size_ == 0) + __begin_node_ = __end_node(); else { - __begin_node() = __t.__begin_node(); + __begin_node_ = __t.__begin_node_; __end_node()->__left_ = 
__t.__end_node()->__left_; __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - size() = __t.size(); - __t.__begin_node() = __t.__end_node(); + __size_ = __t.__size_; + __t.__begin_node_ = __t.__end_node(); __t.__end_node()->__left_ = nullptr; - __t.size() = 0; + __t.__size_ = 0; } } else { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } } @@ -1387,13 +1381,13 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, true_type) __move_assign_alloc(__t); __size_ = __t.__size_; __value_comp_ = std::move(__t.__value_comp_); - if (size() == 0) - __begin_node() = __end_node(); + if (__size_ == 0) + __begin_node_ = __end_node(); else { __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - __t.__begin_node() = __t.__end_node(); + __t.__begin_node_ = __t.__end_node(); __t.__end_node()->__left_ = nullptr; - __t.size() = 0; + __t.__size_ = 0; } } @@ -1404,15 +1398,15 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, false_type) { else { value_comp() = std::move(__t.value_comp()); const_iterator __e = end(); - if (size() != 0) { + if (__size_ != 0) { _DetachedTreeCache __cache(this); - while (__cache.__get() != nullptr && __t.size() != 0) { + while (__cache.__get() != nullptr && __t.__size_ != 0) { __assign_value(__cache.__get()->__value_, std::move(__t.remove(__t.begin())->__value_)); __node_insert_multi(__cache.__get()); __cache.__advance(); } } - while (__t.size() != 0) { + while (__t.__size_ != 0) { __insert_multi_from_orphaned_node(__e, std::move(__t.remove(__t.begin())->__value_)); } } @@ -1460,12 +1454,12 @@ void __tree<_Tp, _Compare, _Allocator>::swap(__tree& __t) std::__swap_allocator(__node_alloc(), __t.__node_alloc()); swap(__size_, __t.__size_); swap(__value_comp_, __t.__value_comp_); - if (size() == 0) - __begin_node() = __end_node(); + if (__size_ == 0) + __begin_node_ = __end_node(); else __end_node()->__left_->__parent_ = __end_node(); - if (__t.size() == 0) - __t.__begin_node() = __t.__end_node(); + if (__t.__size_ == 0) + __t.__begin_node_ = __t.__end_node(); else __t.__end_node()->__left_->__parent_ = __t.__end_node(); } @@ -1473,8 +1467,8 @@ void __tree<_Tp, _Compare, _Allocator>::swap(__tree& __t) template void __tree<_Tp, _Compare, _Allocator>::clear() _NOEXCEPT { destroy(__root()); - size() = 0; - __begin_node() = __end_node(); + __size_ = 0; + __begin_node_ = __end_node(); __end_node()->__left_ = nullptr; } @@ -1664,10 +1658,10 @@ void __tree<_Tp, _Compare, _Allocator>::__insert_node_at( __new_node->__parent_ = __parent; // __new_node->__is_black_ is initialized in __tree_balance_after_insert __child = __new_node; - if (__begin_node()->__left_ != nullptr) - __begin_node() = static_cast<__end_node_pointer>(__begin_node()->__left_); + if (__begin_node_->__left_ != nullptr) + __begin_node_ = static_cast<__end_node_pointer>(__begin_node_->__left_); std::__tree_balance_after_insert(__end_node()->__left_, __child); - ++size(); + ++__size_; } template @@ -1811,9 +1805,9 @@ typename __tree<_Tp, _Compare, _Allocator>::iterator __tree<_Tp, _Compare, _Allocator>::__remove_node_pointer(__node_pointer __ptr) _NOEXCEPT { iterator __r(__ptr); ++__r; - if (__begin_node() == __ptr) - __begin_node() = __r.__ptr_; - --size(); + if (__begin_node_ == __ptr) + __begin_node_ = __r.__ptr_; + --__size_; std::__tree_remove(__end_node()->__left_, static_cast<__node_base_pointer>(__ptr)); return __r; } @@ -2177,13 +2171,13 @@ template typename __tree<_Tp, _Compare, _Allocator>::__node_holder 
__tree<_Tp, _Compare, _Allocator>::remove(const_iterator __p) _NOEXCEPT { __node_pointer __np = __p.__get_np(); - if (__begin_node() == __p.__ptr_) { + if (__begin_node_ == __p.__ptr_) { if (__np->__right_ != nullptr) - __begin_node() = static_cast<__end_node_pointer>(__np->__right_); + __begin_node_ = static_cast<__end_node_pointer>(__np->__right_); else - __begin_node() = static_cast<__end_node_pointer>(__np->__parent_); + __begin_node_ = static_cast<__end_node_pointer>(__np->__parent_); } - --size(); + --__size_; std::__tree_remove(__end_node()->__left_, static_cast<__node_base_pointer>(__np)); return __node_holder(__np, _Dp(__node_alloc(), true)); } diff --git a/libcxx/include/__tuple/make_tuple_types.h b/libcxx/include/__tuple/make_tuple_types.h index a5c9bcf23a6eb..3c22ec85dc9c7 100644 --- a/libcxx/include/__tuple/make_tuple_types.h +++ b/libcxx/include/__tuple/make_tuple_types.h @@ -14,12 +14,12 @@ #include <__fwd/array.h> #include <__fwd/tuple.h> #include <__tuple/tuple_element.h> -#include <__tuple/tuple_indices.h> #include <__tuple/tuple_size.h> #include <__tuple/tuple_types.h> #include <__type_traits/copy_cvref.h> #include <__type_traits/remove_cvref.h> #include <__type_traits/remove_reference.h> +#include <__utility/integer_sequence.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -38,38 +38,35 @@ template struct __make_tuple_types_flat; template